path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Makefile | 5
-rw-r--r-- kernel/audit.c | 12
-rw-r--r-- kernel/capability.c | 29
-rw-r--r-- kernel/cgroup.c | 3720
-rw-r--r-- kernel/cgroup_freezer.c | 40
-rw-r--r-- kernel/compat.c | 212
-rw-r--r-- kernel/cpu/Makefile | 1
-rw-r--r-- kernel/cpu/idle.c | 144
-rw-r--r-- kernel/cpuset.c | 264
-rw-r--r-- kernel/debug/debug_core.c | 2
-rw-r--r-- kernel/events/core.c | 64
-rw-r--r-- kernel/events/uprobes.c | 9
-rw-r--r-- kernel/exit.c | 2
-rw-r--r-- kernel/extable.c | 2
-rw-r--r-- kernel/fork.c | 6
-rw-r--r-- kernel/futex.c | 90
-rw-r--r-- kernel/futex_compat.c | 2
-rw-r--r-- kernel/groups.c | 14
-rw-r--r-- kernel/hrtimer.c | 15
-rw-r--r-- kernel/hung_task.c | 3
-rw-r--r-- kernel/irq/chip.c | 48
-rw-r--r-- kernel/irq/handle.c | 5
-rw-r--r-- kernel/irq/internals.h | 9
-rw-r--r-- kernel/irq/irqdesc.c | 5
-rw-r--r-- kernel/irq/manage.c | 129
-rw-r--r-- kernel/irq/proc.c | 8
-rw-r--r-- kernel/irq_work.c | 6
-rw-r--r-- kernel/kexec.c | 12
-rw-r--r-- kernel/ksysfs.c | 2
-rw-r--r-- kernel/kthread.c | 4
-rw-r--r-- kernel/locking/Makefile | 3
-rw-r--r-- kernel/locking/lockdep.c | 23
-rw-r--r-- kernel/locking/locktorture.c | 452
-rw-r--r-- kernel/locking/mcs_spinlock.c | 178
-rw-r--r-- kernel/locking/mcs_spinlock.h | 129
-rw-r--r-- kernel/locking/mutex-debug.c | 6
-rw-r--r-- kernel/locking/mutex.c | 104
-rw-r--r-- kernel/locking/rtmutex.c | 12
-rw-r--r-- kernel/locking/rwsem-xadd.c | 4
-rw-r--r-- kernel/module.c | 16
-rw-r--r-- kernel/notifier.c | 2
-rw-r--r-- kernel/panic.c | 6
-rw-r--r-- kernel/pid_namespace.c | 4
-rw-r--r-- kernel/power/hibernate.c | 22
-rw-r--r-- kernel/power/main.c | 4
-rw-r--r-- kernel/power/power.h | 2
-rw-r--r-- kernel/power/qos.c | 18
-rw-r--r-- kernel/power/snapshot.c | 2
-rw-r--r-- kernel/power/suspend.c | 2
-rw-r--r-- kernel/power/wakelock.c | 2
-rw-r--r-- kernel/printk/printk.c | 15
-rw-r--r-- kernel/profile.c | 2
-rw-r--r-- kernel/ptrace.c | 4
-rw-r--r-- kernel/rcu/Makefile | 2
-rw-r--r-- kernel/rcu/rcu.h | 7
-rw-r--r-- kernel/rcu/rcutorture.c (renamed from kernel/rcu/torture.c) | 1004
-rw-r--r-- kernel/rcu/srcu.c | 11
-rw-r--r-- kernel/rcu/tiny.c | 8
-rw-r--r-- kernel/rcu/tiny_plugin.h | 4
-rw-r--r-- kernel/rcu/tree.c | 80
-rw-r--r-- kernel/rcu/tree.h | 4
-rw-r--r-- kernel/rcu/tree_plugin.h | 19
-rw-r--r-- kernel/rcu/tree_trace.c | 6
-rw-r--r-- kernel/rcu/update.c | 5
-rw-r--r-- kernel/relay.c | 2
-rw-r--r-- kernel/resource.c | 14
-rw-r--r-- kernel/sched/Makefile | 2
-rw-r--r-- kernel/sched/auto_group.c | 2
-rw-r--r-- kernel/sched/core.c | 280
-rw-r--r-- kernel/sched/cpuacct.c | 6
-rw-r--r-- kernel/sched/cputime.c | 20
-rw-r--r-- kernel/sched/deadline.c | 56
-rw-r--r-- kernel/sched/debug.c | 10
-rw-r--r-- kernel/sched/fair.c | 600
-rw-r--r-- kernel/sched/idle.c | 265
-rw-r--r-- kernel/sched/idle_task.c | 25
-rw-r--r-- kernel/sched/rt.c | 102
-rw-r--r-- kernel/sched/sched.h | 75
-rw-r--r-- kernel/sched/stats.c | 2
-rw-r--r-- kernel/sched/stop_task.c | 15
-rw-r--r-- kernel/seccomp.c | 121
-rw-r--r-- kernel/signal.c | 2
-rw-r--r-- kernel/smp.c | 139
-rw-r--r-- kernel/softirq.c | 1
-rw-r--r-- kernel/sys.c | 8
-rw-r--r-- kernel/sys_ni.c | 2
-rw-r--r-- kernel/sysctl.c | 23
-rw-r--r-- kernel/time/Kconfig | 2
-rw-r--r-- kernel/time/Makefile | 5
-rw-r--r-- kernel/time/clockevents.c | 40
-rw-r--r-- kernel/time/ntp.c | 5
-rw-r--r-- kernel/time/tick-broadcast-hrtimer.c | 106
-rw-r--r-- kernel/time/tick-broadcast.c | 85
-rw-r--r-- kernel/time/tick-common.c | 16
-rw-r--r-- kernel/time/tick-internal.h | 11
-rw-r--r-- kernel/time/timekeeping.c | 3
-rw-r--r-- kernel/time/timekeeping_debug.c | 2
-rw-r--r-- kernel/timer.c | 59
-rw-r--r-- kernel/torture.c | 719
-rw-r--r-- kernel/trace/Kconfig | 1
-rw-r--r-- kernel/trace/blktrace.c | 23
-rw-r--r-- kernel/trace/ftrace.c | 162
-rw-r--r-- kernel/trace/ring_buffer_benchmark.c | 6
-rw-r--r-- kernel/trace/trace.c | 214
-rw-r--r-- kernel/trace/trace.h | 38
-rw-r--r-- kernel/trace/trace_event_perf.c | 22
-rw-r--r-- kernel/trace/trace_events.c | 36
-rw-r--r-- kernel/trace/trace_export.c | 7
-rw-r--r-- kernel/trace/trace_functions.c | 143
-rw-r--r-- kernel/trace/trace_functions_graph.c | 3
-rw-r--r-- kernel/trace/trace_irqsoff.c | 14
-rw-r--r-- kernel/trace/trace_kprobe.c | 17
-rw-r--r-- kernel/trace/trace_nop.c | 5
-rw-r--r-- kernel/trace/trace_output.c | 31
-rw-r--r-- kernel/trace/trace_probe.h | 17
-rw-r--r-- kernel/trace/trace_sched_wakeup.c | 10
-rw-r--r-- kernel/trace/trace_stack.c | 3
-rw-r--r-- kernel/trace/trace_uprobe.c | 191
-rw-r--r-- kernel/tracepoint.c | 256
-rw-r--r-- kernel/up.c | 6
-rw-r--r-- kernel/user.c | 3
-rw-r--r-- kernel/user_namespace.c | 2
-rw-r--r-- kernel/watchdog.c | 19
-rw-r--r-- kernel/workqueue.c | 9
124 files changed, 6229 insertions, 4860 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index bc010ee272b6..f2a8b6246ce9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -18,11 +18,13 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
18CFLAGS_REMOVE_irq_work.o = -pg 18CFLAGS_REMOVE_irq_work.o = -pg
19endif 19endif
20 20
21# cond_syscall is currently not LTO compatible
22CFLAGS_sys_ni.o = $(DISABLE_LTO)
23
21obj-y += sched/ 24obj-y += sched/
22obj-y += locking/ 25obj-y += locking/
23obj-y += power/ 26obj-y += power/
24obj-y += printk/ 27obj-y += printk/
25obj-y += cpu/
26obj-y += irq/ 28obj-y += irq/
27obj-y += rcu/ 29obj-y += rcu/
28 30
@@ -93,6 +95,7 @@ obj-$(CONFIG_PADATA) += padata.o
93obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 95obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
94obj-$(CONFIG_JUMP_LABEL) += jump_label.o 96obj-$(CONFIG_JUMP_LABEL) += jump_label.o
95obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o 97obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
98obj-$(CONFIG_TORTURE_TEST) += torture.o
96 99
97$(obj)/configs.o: $(obj)/config_data.h 100$(obj)/configs.o: $(obj)/config_data.h
98 101
diff --git a/kernel/audit.c b/kernel/audit.c
index 3392d3e0254a..95a20f3f52f1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -608,9 +608,19 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
608 int err = 0; 608 int err = 0;
609 609
610 /* Only support the initial namespaces for now. */ 610 /* Only support the initial namespaces for now. */
611 /*
612 * We return ECONNREFUSED because it tricks userspace into thinking
613 * that audit was not configured into the kernel. Lots of users
614 * configure their PAM stack (because that's what the distro does)
615 * to reject login if unable to send messages to audit. If we return
616 * ECONNREFUSED the PAM stack thinks the kernel does not have audit
617 * configured in and will let login proceed. If we return EPERM
618 * userspace will reject all logins. This should be removed when we
619 * support non init namespaces!!
620 */
611 if ((current_user_ns() != &init_user_ns) || 621 if ((current_user_ns() != &init_user_ns) ||
612 (task_active_pid_ns(current) != &init_pid_ns)) 622 (task_active_pid_ns(current) != &init_pid_ns))
613 return -EPERM; 623 return -ECONNREFUSED;
614 624
615 switch (msg_type) { 625 switch (msg_type) {
616 case AUDIT_LIST: 626 case AUDIT_LIST:
diff --git a/kernel/capability.c b/kernel/capability.c
index 34019c57888d..a8d63df0c322 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,8 @@
7 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> 7 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/audit.h> 12#include <linux/audit.h>
11#include <linux/capability.h> 13#include <linux/capability.h>
12#include <linux/mm.h> 14#include <linux/mm.h>
@@ -42,15 +44,10 @@ __setup("no_file_caps", file_caps_disable);
42 44
43static void warn_legacy_capability_use(void) 45static void warn_legacy_capability_use(void)
44{ 46{
45 static int warned; 47 char name[sizeof(current->comm)];
46 if (!warned) { 48
47 char name[sizeof(current->comm)]; 49 pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
48 50 get_task_comm(name, current));
49 printk(KERN_INFO "warning: `%s' uses 32-bit capabilities"
50 " (legacy support in use)\n",
51 get_task_comm(name, current));
52 warned = 1;
53 }
54} 51}
55 52
56/* 53/*
@@ -71,16 +68,10 @@ static void warn_legacy_capability_use(void)
71 68
72static void warn_deprecated_v2(void) 69static void warn_deprecated_v2(void)
73{ 70{
74 static int warned; 71 char name[sizeof(current->comm)];
75 72
76 if (!warned) { 73 pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
77 char name[sizeof(current->comm)]; 74 get_task_comm(name, current));
78
79 printk(KERN_INFO "warning: `%s' uses deprecated v2"
80 " capabilities in a way that may be insecure.\n",
81 get_task_comm(name, current));
82 warned = 1;
83 }
84} 75}
85 76
86/* 77/*
@@ -380,7 +371,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
380bool ns_capable(struct user_namespace *ns, int cap) 371bool ns_capable(struct user_namespace *ns, int cap)
381{ 372{
382 if (unlikely(!cap_valid(cap))) { 373 if (unlikely(!cap_valid(cap))) {
383 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); 374 pr_crit("capable() called with invalid cap=%u\n", cap);
384 BUG(); 375 BUG();
385 } 376 }
386 377
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 105f273b6f86..fede3d3f28ff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -40,23 +40,20 @@
40#include <linux/proc_fs.h> 40#include <linux/proc_fs.h>
41#include <linux/rcupdate.h> 41#include <linux/rcupdate.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/backing-dev.h>
44#include <linux/slab.h> 43#include <linux/slab.h>
45#include <linux/magic.h>
46#include <linux/spinlock.h> 44#include <linux/spinlock.h>
45#include <linux/rwsem.h>
47#include <linux/string.h> 46#include <linux/string.h>
48#include <linux/sort.h> 47#include <linux/sort.h>
49#include <linux/kmod.h> 48#include <linux/kmod.h>
50#include <linux/module.h>
51#include <linux/delayacct.h> 49#include <linux/delayacct.h>
52#include <linux/cgroupstats.h> 50#include <linux/cgroupstats.h>
53#include <linux/hashtable.h> 51#include <linux/hashtable.h>
54#include <linux/namei.h>
55#include <linux/pid_namespace.h> 52#include <linux/pid_namespace.h>
56#include <linux/idr.h> 53#include <linux/idr.h>
57#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 54#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
58#include <linux/flex_array.h> /* used in cgroup_attach_task */
59#include <linux/kthread.h> 55#include <linux/kthread.h>
56#include <linux/delay.h>
60 57
61#include <linux/atomic.h> 58#include <linux/atomic.h>
62 59
@@ -68,43 +65,49 @@
68 */ 65 */
69#define CGROUP_PIDLIST_DESTROY_DELAY HZ 66#define CGROUP_PIDLIST_DESTROY_DELAY HZ
70 67
68#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
69 MAX_CFTYPE_NAME + 2)
70
71/*
72 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
73 * creation/removal and hierarchy changing operations including cgroup
74 * creation, removal, css association and controller rebinding. This outer
75 * lock is needed mainly to resolve the circular dependency between kernfs
76 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
77 */
78static DEFINE_MUTEX(cgroup_tree_mutex);
79
71/* 80/*
72 * cgroup_mutex is the master lock. Any modification to cgroup or its 81 * cgroup_mutex is the master lock. Any modification to cgroup or its
73 * hierarchy must be performed while holding it. 82 * hierarchy must be performed while holding it.
74 * 83 *
75 * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify 84 * css_set_rwsem protects task->cgroups pointer, the list of css_set
76 * cgroupfs_root of any cgroup hierarchy - subsys list, flags, 85 * objects, and the chain of tasks off each css_set.
77 * release_agent_path and so on. Modifying requires both cgroup_mutex and
78 * cgroup_root_mutex. Readers can acquire either of the two. This is to
79 * break the following locking order cycle.
80 *
81 * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
82 * B. namespace_sem -> cgroup_mutex
83 * 86 *
84 * B happens only through cgroup_show_options() and using cgroup_root_mutex 87 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
85 * breaks it. 88 * cgroup.h can use them for lockdep annotations.
86 */ 89 */
87#ifdef CONFIG_PROVE_RCU 90#ifdef CONFIG_PROVE_RCU
88DEFINE_MUTEX(cgroup_mutex); 91DEFINE_MUTEX(cgroup_mutex);
89EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ 92DECLARE_RWSEM(css_set_rwsem);
93EXPORT_SYMBOL_GPL(cgroup_mutex);
94EXPORT_SYMBOL_GPL(css_set_rwsem);
90#else 95#else
91static DEFINE_MUTEX(cgroup_mutex); 96static DEFINE_MUTEX(cgroup_mutex);
97static DECLARE_RWSEM(css_set_rwsem);
92#endif 98#endif
93 99
94static DEFINE_MUTEX(cgroup_root_mutex); 100/*
101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires
102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
103 */
104static DEFINE_SPINLOCK(release_agent_path_lock);
95 105
96#define cgroup_assert_mutex_or_rcu_locked() \ 106#define cgroup_assert_mutexes_or_rcu_locked() \
97 rcu_lockdep_assert(rcu_read_lock_held() || \ 107 rcu_lockdep_assert(rcu_read_lock_held() || \
108 lockdep_is_held(&cgroup_tree_mutex) || \
98 lockdep_is_held(&cgroup_mutex), \ 109 lockdep_is_held(&cgroup_mutex), \
99 "cgroup_mutex or RCU read lock required"); 110 "cgroup_[tree_]mutex or RCU read lock required");
100
101#ifdef CONFIG_LOCKDEP
102#define cgroup_assert_mutex_or_root_locked() \
103 WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
104 !lockdep_is_held(&cgroup_root_mutex)))
105#else
106#define cgroup_assert_mutex_or_root_locked() do { } while (0)
107#endif
108 111
109/* 112/*
110 * cgroup destruction makes heavy use of work items and there can be a lot 113 * cgroup destruction makes heavy use of work items and there can be a lot
@@ -120,42 +123,41 @@ static struct workqueue_struct *cgroup_destroy_wq;
120 */ 123 */
121static struct workqueue_struct *cgroup_pidlist_destroy_wq; 124static struct workqueue_struct *cgroup_pidlist_destroy_wq;
122 125
123/* 126/* generate an array of cgroup subsystem pointers */
124 * Generate an array of cgroup subsystem pointers. At boot time, this is 127#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
125 * populated with the built in subsystems, and modular subsystems are 128static struct cgroup_subsys *cgroup_subsys[] = {
126 * registered after that. The mutable section of this array is protected by 129#include <linux/cgroup_subsys.h>
127 * cgroup_mutex. 130};
128 */ 131#undef SUBSYS
129#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, 132
130#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) 133/* array of cgroup subsystem names */
131static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = { 134#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
135static const char *cgroup_subsys_name[] = {
132#include <linux/cgroup_subsys.h> 136#include <linux/cgroup_subsys.h>
133}; 137};
138#undef SUBSYS
134 139
135/* 140/*
136 * The dummy hierarchy, reserved for the subsystems that are otherwise 141 * The default hierarchy, reserved for the subsystems that are otherwise
137 * unattached - it never has more than a single cgroup, and all tasks are 142 * unattached - it never has more than a single cgroup, and all tasks are
138 * part of that cgroup. 143 * part of that cgroup.
139 */ 144 */
140static struct cgroupfs_root cgroup_dummy_root; 145struct cgroup_root cgrp_dfl_root;
141 146
142/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ 147/*
143static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; 148 * The default hierarchy always exists but is hidden until mounted for the
149 * first time. This is for backward compatibility.
150 */
151static bool cgrp_dfl_root_visible;
144 152
145/* The list of hierarchy roots */ 153/* The list of hierarchy roots */
146 154
147static LIST_HEAD(cgroup_roots); 155static LIST_HEAD(cgroup_roots);
148static int cgroup_root_count; 156static int cgroup_root_count;
149 157
150/* 158/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
151 * Hierarchy ID allocation and mapping. It follows the same exclusion
152 * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
153 * writes, either for reads.
154 */
155static DEFINE_IDR(cgroup_hierarchy_idr); 159static DEFINE_IDR(cgroup_hierarchy_idr);
156 160
157static struct cgroup_name root_cgroup_name = { .name = "/" };
158
159/* 161/*
160 * Assign a monotonically increasing serial number to cgroups. It 162 * Assign a monotonically increasing serial number to cgroups. It
161 * guarantees cgroups with bigger numbers are newer than those with smaller 163 * guarantees cgroups with bigger numbers are newer than those with smaller
@@ -175,11 +177,13 @@ static int need_forkexit_callback __read_mostly;
175 177
176static struct cftype cgroup_base_files[]; 178static struct cftype cgroup_base_files[];
177 179
180static void cgroup_put(struct cgroup *cgrp);
181static int rebind_subsystems(struct cgroup_root *dst_root,
182 unsigned long ss_mask);
178static void cgroup_destroy_css_killed(struct cgroup *cgrp); 183static void cgroup_destroy_css_killed(struct cgroup *cgrp);
179static int cgroup_destroy_locked(struct cgroup *cgrp); 184static int cgroup_destroy_locked(struct cgroup *cgrp);
180static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 185static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
181 bool is_add); 186 bool is_add);
182static int cgroup_file_release(struct inode *inode, struct file *file);
183static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); 187static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
184 188
185/** 189/**
@@ -197,8 +201,9 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
197 struct cgroup_subsys *ss) 201 struct cgroup_subsys *ss)
198{ 202{
199 if (ss) 203 if (ss)
200 return rcu_dereference_check(cgrp->subsys[ss->subsys_id], 204 return rcu_dereference_check(cgrp->subsys[ss->id],
201 lockdep_is_held(&cgroup_mutex)); 205 lockdep_is_held(&cgroup_tree_mutex) ||
206 lockdep_is_held(&cgroup_mutex));
202 else 207 else
203 return &cgrp->dummy_css; 208 return &cgrp->dummy_css;
204} 209}
@@ -209,6 +214,27 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
209 return test_bit(CGRP_DEAD, &cgrp->flags); 214 return test_bit(CGRP_DEAD, &cgrp->flags);
210} 215}
211 216
217struct cgroup_subsys_state *seq_css(struct seq_file *seq)
218{
219 struct kernfs_open_file *of = seq->private;
220 struct cgroup *cgrp = of->kn->parent->priv;
221 struct cftype *cft = seq_cft(seq);
222
223 /*
224 * This is open and unprotected implementation of cgroup_css().
225 * seq_css() is only called from a kernfs file operation which has
226 * an active reference on the file. Because all the subsystem
227 * files are drained before a css is disassociated with a cgroup,
228 * the matching css from the cgroup's subsys table is guaranteed to
229 * be and stay valid until the enclosing operation is complete.
230 */
231 if (cft->ss)
232 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
233 else
234 return &cgrp->dummy_css;
235}
236EXPORT_SYMBOL_GPL(seq_css);
237
212/** 238/**
213 * cgroup_is_descendant - test ancestry 239 * cgroup_is_descendant - test ancestry
214 * @cgrp: the cgroup to be tested 240 * @cgrp: the cgroup to be tested
@@ -227,7 +253,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
227 } 253 }
228 return false; 254 return false;
229} 255}
230EXPORT_SYMBOL_GPL(cgroup_is_descendant);
231 256
232static int cgroup_is_releasable(const struct cgroup *cgrp) 257static int cgroup_is_releasable(const struct cgroup *cgrp)
233{ 258{
@@ -254,54 +279,23 @@ static int notify_on_release(const struct cgroup *cgrp)
254 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 279 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
255 if (!((css) = rcu_dereference_check( \ 280 if (!((css) = rcu_dereference_check( \
256 (cgrp)->subsys[(ssid)], \ 281 (cgrp)->subsys[(ssid)], \
282 lockdep_is_held(&cgroup_tree_mutex) || \
257 lockdep_is_held(&cgroup_mutex)))) { } \ 283 lockdep_is_held(&cgroup_mutex)))) { } \
258 else 284 else
259 285
260/** 286/**
261 * for_each_subsys - iterate all loaded cgroup subsystems 287 * for_each_subsys - iterate all enabled cgroup subsystems
262 * @ss: the iteration cursor 288 * @ss: the iteration cursor
263 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 289 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
264 *
265 * Iterates through all loaded subsystems. Should be called under
266 * cgroup_mutex or cgroup_root_mutex.
267 */ 290 */
268#define for_each_subsys(ss, ssid) \ 291#define for_each_subsys(ss, ssid) \
269 for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ 292 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
270 (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 293 (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
271 if (!((ss) = cgroup_subsys[(ssid)])) { } \
272 else
273 294
274/** 295/* iterate across the hierarchies */
275 * for_each_builtin_subsys - iterate all built-in cgroup subsystems 296#define for_each_root(root) \
276 * @ss: the iteration cursor
277 * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
278 *
279 * Bulit-in subsystems are always present and iteration itself doesn't
280 * require any synchronization.
281 */
282#define for_each_builtin_subsys(ss, i) \
283 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
284 (((ss) = cgroup_subsys[i]) || true); (i)++)
285
286/* iterate across the active hierarchies */
287#define for_each_active_root(root) \
288 list_for_each_entry((root), &cgroup_roots, root_list) 297 list_for_each_entry((root), &cgroup_roots, root_list)
289 298
290static inline struct cgroup *__d_cgrp(struct dentry *dentry)
291{
292 return dentry->d_fsdata;
293}
294
295static inline struct cfent *__d_cfe(struct dentry *dentry)
296{
297 return dentry->d_fsdata;
298}
299
300static inline struct cftype *__d_cft(struct dentry *dentry)
301{
302 return __d_cfe(dentry)->type;
303}
304
305/** 299/**
306 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 300 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
307 * @cgrp: the cgroup to be checked for liveness 301 * @cgrp: the cgroup to be checked for liveness
@@ -347,23 +341,23 @@ struct cgrp_cset_link {
347 struct list_head cgrp_link; 341 struct list_head cgrp_link;
348}; 342};
349 343
350/* The default css_set - used by init and its children prior to any 344/*
345 * The default css_set - used by init and its children prior to any
351 * hierarchies being mounted. It contains a pointer to the root state 346 * hierarchies being mounted. It contains a pointer to the root state
352 * for each subsystem. Also used to anchor the list of css_sets. Not 347 * for each subsystem. Also used to anchor the list of css_sets. Not
353 * reference-counted, to improve performance when child cgroups 348 * reference-counted, to improve performance when child cgroups
354 * haven't been created. 349 * haven't been created.
355 */ 350 */
351static struct css_set init_css_set = {
352 .refcount = ATOMIC_INIT(1),
353 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
354 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
355 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
356 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
357 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
358};
356 359
357static struct css_set init_css_set; 360static int css_set_count = 1; /* 1 for init_css_set */
358static struct cgrp_cset_link init_cgrp_cset_link;
359
360/*
361 * css_set_lock protects the list of css_set objects, and the chain of
362 * tasks off each css_set. Nests outside task->alloc_lock due to
363 * css_task_iter_start().
364 */
365static DEFINE_RWLOCK(css_set_lock);
366static int css_set_count;
367 361
368/* 362/*
369 * hash table for cgroup groups. This improves the performance to find 363 * hash table for cgroup groups. This improves the performance to find
@@ -386,30 +380,14 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
386 return key; 380 return key;
387} 381}
388 382
389/* 383static void put_css_set_locked(struct css_set *cset, bool taskexit)
390 * We don't maintain the lists running through each css_set to its task
391 * until after the first call to css_task_iter_start(). This reduces the
392 * fork()/exit() overhead for people who have cgroups compiled into their
393 * kernel but not actually in use.
394 */
395static int use_task_css_set_links __read_mostly;
396
397static void __put_css_set(struct css_set *cset, int taskexit)
398{ 384{
399 struct cgrp_cset_link *link, *tmp_link; 385 struct cgrp_cset_link *link, *tmp_link;
400 386
401 /* 387 lockdep_assert_held(&css_set_rwsem);
402 * Ensure that the refcount doesn't hit zero while any readers 388
403 * can see it. Similar to atomic_dec_and_lock(), but for an 389 if (!atomic_dec_and_test(&cset->refcount))
404 * rwlock
405 */
406 if (atomic_add_unless(&cset->refcount, -1, 1))
407 return;
408 write_lock(&css_set_lock);
409 if (!atomic_dec_and_test(&cset->refcount)) {
410 write_unlock(&css_set_lock);
411 return; 390 return;
412 }
413 391
414 /* This css_set is dead. unlink it and release cgroup refcounts */ 392 /* This css_set is dead. unlink it and release cgroup refcounts */
415 hash_del(&cset->hlist); 393 hash_del(&cset->hlist);
@@ -421,7 +399,7 @@ static void __put_css_set(struct css_set *cset, int taskexit)
421 list_del(&link->cset_link); 399 list_del(&link->cset_link);
422 list_del(&link->cgrp_link); 400 list_del(&link->cgrp_link);
423 401
424 /* @cgrp can't go away while we're holding css_set_lock */ 402 /* @cgrp can't go away while we're holding css_set_rwsem */
425 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { 403 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
426 if (taskexit) 404 if (taskexit)
427 set_bit(CGRP_RELEASABLE, &cgrp->flags); 405 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -431,10 +409,24 @@ static void __put_css_set(struct css_set *cset, int taskexit)
431 kfree(link); 409 kfree(link);
432 } 410 }
433 411
434 write_unlock(&css_set_lock);
435 kfree_rcu(cset, rcu_head); 412 kfree_rcu(cset, rcu_head);
436} 413}
437 414
415static void put_css_set(struct css_set *cset, bool taskexit)
416{
417 /*
418 * Ensure that the refcount doesn't hit zero while any readers
419 * can see it. Similar to atomic_dec_and_lock(), but for an
420 * rwlock
421 */
422 if (atomic_add_unless(&cset->refcount, -1, 1))
423 return;
424
425 down_write(&css_set_rwsem);
426 put_css_set_locked(cset, taskexit);
427 up_write(&css_set_rwsem);
428}
429
438/* 430/*
439 * refcounted get/put for css_set objects 431 * refcounted get/put for css_set objects
440 */ 432 */
@@ -443,16 +435,6 @@ static inline void get_css_set(struct css_set *cset)
443 atomic_inc(&cset->refcount); 435 atomic_inc(&cset->refcount);
444} 436}
445 437
446static inline void put_css_set(struct css_set *cset)
447{
448 __put_css_set(cset, 0);
449}
450
451static inline void put_css_set_taskexit(struct css_set *cset)
452{
453 __put_css_set(cset, 1);
454}
455
456/** 438/**
457 * compare_css_sets - helper function for find_existing_css_set(). 439 * compare_css_sets - helper function for find_existing_css_set().
458 * @cset: candidate css_set being tested 440 * @cset: candidate css_set being tested
@@ -535,7 +517,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
535 struct cgroup *cgrp, 517 struct cgroup *cgrp,
536 struct cgroup_subsys_state *template[]) 518 struct cgroup_subsys_state *template[])
537{ 519{
538 struct cgroupfs_root *root = cgrp->root; 520 struct cgroup_root *root = cgrp->root;
539 struct cgroup_subsys *ss; 521 struct cgroup_subsys *ss;
540 struct css_set *cset; 522 struct css_set *cset;
541 unsigned long key; 523 unsigned long key;
@@ -547,7 +529,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
547 * won't change, so no need for locking. 529 * won't change, so no need for locking.
548 */ 530 */
549 for_each_subsys(ss, i) { 531 for_each_subsys(ss, i) {
550 if (root->subsys_mask & (1UL << i)) { 532 if (root->cgrp.subsys_mask & (1UL << i)) {
551 /* Subsystem is in this hierarchy. So we want 533 /* Subsystem is in this hierarchy. So we want
552 * the subsystem state from the new 534 * the subsystem state from the new
553 * cgroup */ 535 * cgroup */
@@ -652,11 +634,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
652 634
653 /* First see if we already have a cgroup group that matches 635 /* First see if we already have a cgroup group that matches
654 * the desired set */ 636 * the desired set */
655 read_lock(&css_set_lock); 637 down_read(&css_set_rwsem);
656 cset = find_existing_css_set(old_cset, cgrp, template); 638 cset = find_existing_css_set(old_cset, cgrp, template);
657 if (cset) 639 if (cset)
658 get_css_set(cset); 640 get_css_set(cset);
659 read_unlock(&css_set_lock); 641 up_read(&css_set_rwsem);
660 642
661 if (cset) 643 if (cset)
662 return cset; 644 return cset;
@@ -674,13 +656,16 @@ static struct css_set *find_css_set(struct css_set *old_cset,
674 atomic_set(&cset->refcount, 1); 656 atomic_set(&cset->refcount, 1);
675 INIT_LIST_HEAD(&cset->cgrp_links); 657 INIT_LIST_HEAD(&cset->cgrp_links);
676 INIT_LIST_HEAD(&cset->tasks); 658 INIT_LIST_HEAD(&cset->tasks);
659 INIT_LIST_HEAD(&cset->mg_tasks);
660 INIT_LIST_HEAD(&cset->mg_preload_node);
661 INIT_LIST_HEAD(&cset->mg_node);
677 INIT_HLIST_NODE(&cset->hlist); 662 INIT_HLIST_NODE(&cset->hlist);
678 663
679 /* Copy the set of subsystem state objects generated in 664 /* Copy the set of subsystem state objects generated in
680 * find_existing_css_set() */ 665 * find_existing_css_set() */
681 memcpy(cset->subsys, template, sizeof(cset->subsys)); 666 memcpy(cset->subsys, template, sizeof(cset->subsys));
682 667
683 write_lock(&css_set_lock); 668 down_write(&css_set_rwsem);
684 /* Add reference counts and links from the new css_set. */ 669 /* Add reference counts and links from the new css_set. */
685 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { 670 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
686 struct cgroup *c = link->cgrp; 671 struct cgroup *c = link->cgrp;
@@ -698,31 +683,105 @@ static struct css_set *find_css_set(struct css_set *old_cset,
698 key = css_set_hash(cset->subsys); 683 key = css_set_hash(cset->subsys);
699 hash_add(css_set_table, &cset->hlist, key); 684 hash_add(css_set_table, &cset->hlist, key);
700 685
701 write_unlock(&css_set_lock); 686 up_write(&css_set_rwsem);
702 687
703 return cset; 688 return cset;
704} 689}
705 690
706/* 691static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
707 * Return the cgroup for "task" from the given hierarchy. Must be
708 * called with cgroup_mutex held.
709 */
710static struct cgroup *task_cgroup_from_root(struct task_struct *task,
711 struct cgroupfs_root *root)
712{ 692{
713 struct css_set *cset; 693 struct cgroup *root_cgrp = kf_root->kn->priv;
714 struct cgroup *res = NULL; 694
695 return root_cgrp->root;
696}
697
698static int cgroup_init_root_id(struct cgroup_root *root)
699{
700 int id;
701
702 lockdep_assert_held(&cgroup_mutex);
703
704 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
705 if (id < 0)
706 return id;
707
708 root->hierarchy_id = id;
709 return 0;
710}
711
712static void cgroup_exit_root_id(struct cgroup_root *root)
713{
714 lockdep_assert_held(&cgroup_mutex);
715
716 if (root->hierarchy_id) {
717 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
718 root->hierarchy_id = 0;
719 }
720}
721
722static void cgroup_free_root(struct cgroup_root *root)
723{
724 if (root) {
 725	/* hierarchy ID should already have been released */
726 WARN_ON_ONCE(root->hierarchy_id);
727
728 idr_destroy(&root->cgroup_idr);
729 kfree(root);
730 }
731}
732
733static void cgroup_destroy_root(struct cgroup_root *root)
734{
735 struct cgroup *cgrp = &root->cgrp;
736 struct cgrp_cset_link *link, *tmp_link;
737
738 mutex_lock(&cgroup_tree_mutex);
739 mutex_lock(&cgroup_mutex);
740
741 BUG_ON(atomic_read(&root->nr_cgrps));
742 BUG_ON(!list_empty(&cgrp->children));
743
744 /* Rebind all subsystems back to the default hierarchy */
745 rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask);
715 746
716 BUG_ON(!mutex_is_locked(&cgroup_mutex));
717 read_lock(&css_set_lock);
718 /* 747 /*
719 * No need to lock the task - since we hold cgroup_mutex the 748 * Release all the links from cset_links to this hierarchy's
720 * task can't change groups, so the only thing that can happen 749 * root cgroup
721 * is that it exits and its css is set back to init_css_set.
722 */ 750 */
723 cset = task_css_set(task); 751 down_write(&css_set_rwsem);
752
753 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
754 list_del(&link->cset_link);
755 list_del(&link->cgrp_link);
756 kfree(link);
757 }
758 up_write(&css_set_rwsem);
759
760 if (!list_empty(&root->root_list)) {
761 list_del(&root->root_list);
762 cgroup_root_count--;
763 }
764
765 cgroup_exit_root_id(root);
766
767 mutex_unlock(&cgroup_mutex);
768 mutex_unlock(&cgroup_tree_mutex);
769
770 kernfs_destroy_root(root->kf_root);
771 cgroup_free_root(root);
772}
773
774/* look up cgroup associated with given css_set on the specified hierarchy */
775static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
776 struct cgroup_root *root)
777{
778 struct cgroup *res = NULL;
779
780 lockdep_assert_held(&cgroup_mutex);
781 lockdep_assert_held(&css_set_rwsem);
782
724 if (cset == &init_css_set) { 783 if (cset == &init_css_set) {
725 res = &root->top_cgroup; 784 res = &root->cgrp;
726 } else { 785 } else {
727 struct cgrp_cset_link *link; 786 struct cgrp_cset_link *link;
728 787
@@ -735,16 +794,27 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
735 } 794 }
736 } 795 }
737 } 796 }
738 read_unlock(&css_set_lock); 797
739 BUG_ON(!res); 798 BUG_ON(!res);
740 return res; 799 return res;
741} 800}
742 801
743/* 802/*
744 * There is one global cgroup mutex. We also require taking 803 * Return the cgroup for "task" from the given hierarchy. Must be
745 * task_lock() when dereferencing a task's cgroup subsys pointers. 804 * called with cgroup_mutex and css_set_rwsem held.
746 * See "The task_lock() exception", at the end of this comment. 805 */
747 * 806static struct cgroup *task_cgroup_from_root(struct task_struct *task,
807 struct cgroup_root *root)
808{
809 /*
810 * No need to lock the task - since we hold cgroup_mutex the
811 * task can't change groups, so the only thing that can happen
812 * is that it exits and its css is set back to init_css_set.
813 */
814 return cset_cgroup_from_root(task_css_set(task), root);
815}
816
817/*
748 * A task must hold cgroup_mutex to modify cgroups. 818 * A task must hold cgroup_mutex to modify cgroups.
749 * 819 *
750 * Any task can increment and decrement the count field without lock. 820 * Any task can increment and decrement the count field without lock.
@@ -770,98 +840,79 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
770 * A cgroup can only be deleted if both its 'count' of using tasks 840 * A cgroup can only be deleted if both its 'count' of using tasks
771 * is zero, and its list of 'children' cgroups is empty. Since all 841 * is zero, and its list of 'children' cgroups is empty. Since all
772 * tasks in the system use _some_ cgroup, and since there is always at 842 * tasks in the system use _some_ cgroup, and since there is always at
773 * least one task in the system (init, pid == 1), therefore, top_cgroup 843 * least one task in the system (init, pid == 1), therefore, root cgroup
774 * always has either children cgroups and/or using tasks. So we don't 844 * always has either children cgroups and/or using tasks. So we don't
775 * need a special hack to ensure that top_cgroup cannot be deleted. 845 * need a special hack to ensure that root cgroup cannot be deleted.
776 *
777 * The task_lock() exception
778 *
779 * The need for this exception arises from the action of
780 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
781 * another. It does so using cgroup_mutex, however there are
782 * several performance critical places that need to reference
783 * task->cgroup without the expense of grabbing a system global
784 * mutex. Therefore except as noted below, when dereferencing or, as
785 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
786 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
787 * the task_struct routinely used for such matters.
788 * 846 *
789 * P.S. One more locking exception. RCU is used to guard the 847 * P.S. One more locking exception. RCU is used to guard the
790 * update of a tasks cgroup pointer by cgroup_attach_task() 848 * update of a tasks cgroup pointer by cgroup_attach_task()
791 */ 849 */
792 850
793/*
794 * A couple of forward declarations required, due to cyclic reference loop:
795 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
796 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
797 * -> cgroup_mkdir.
798 */
799
800static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
801static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
802static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); 851static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
803static const struct inode_operations cgroup_dir_inode_operations; 852static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
804static const struct file_operations proc_cgroupstats_operations; 853static const struct file_operations proc_cgroupstats_operations;
805 854
806static struct backing_dev_info cgroup_backing_dev_info = { 855static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
807 .name = "cgroup", 856 char *buf)
808 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
809};
810
811static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
812{ 857{
813 struct inode *inode = new_inode(sb); 858 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
814 859 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
815 if (inode) { 860 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
816 inode->i_ino = get_next_ino(); 861 cft->ss->name, cft->name);
817 inode->i_mode = mode; 862 else
818 inode->i_uid = current_fsuid(); 863 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
819 inode->i_gid = current_fsgid(); 864 return buf;
820 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
821 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
822 }
823 return inode;
824} 865}
825 866
826static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) 867/**
868 * cgroup_file_mode - deduce file mode of a control file
869 * @cft: the control file in question
870 *
871 * returns cft->mode if ->mode is not 0
872 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
873 * returns S_IRUGO if it has only a read handler
874 * returns S_IWUSR if it has only a write hander
875 */
876static umode_t cgroup_file_mode(const struct cftype *cft)
827{ 877{
828 struct cgroup_name *name; 878 umode_t mode = 0;
829 879
830 name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); 880 if (cft->mode)
831 if (!name) 881 return cft->mode;
832 return NULL; 882
833 strcpy(name->name, dentry->d_name.name); 883 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
834 return name; 884 mode |= S_IRUGO;
885
886 if (cft->write_u64 || cft->write_s64 || cft->write_string ||
887 cft->trigger)
888 mode |= S_IWUSR;
889
890 return mode;
835} 891}
836 892
837static void cgroup_free_fn(struct work_struct *work) 893static void cgroup_free_fn(struct work_struct *work)
838{ 894{
839 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 895 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
840 896
841 mutex_lock(&cgroup_mutex); 897 atomic_dec(&cgrp->root->nr_cgrps);
842 cgrp->root->number_of_cgroups--;
843 mutex_unlock(&cgroup_mutex);
844
845 /*
846 * We get a ref to the parent's dentry, and put the ref when
847 * this cgroup is being freed, so it's guaranteed that the
848 * parent won't be destroyed before its children.
849 */
850 dput(cgrp->parent->dentry);
851
852 /*
853 * Drop the active superblock reference that we took when we
854 * created the cgroup. This will free cgrp->root, if we are
855 * holding the last reference to @sb.
856 */
857 deactivate_super(cgrp->root->sb);
858
859 cgroup_pidlist_destroy_all(cgrp); 898 cgroup_pidlist_destroy_all(cgrp);
860 899
861 simple_xattrs_free(&cgrp->xattrs); 900 if (cgrp->parent) {
862 901 /*
863 kfree(rcu_dereference_raw(cgrp->name)); 902 * We get a ref to the parent, and put the ref when this
864 kfree(cgrp); 903 * cgroup is being freed, so it's guaranteed that the
904 * parent won't be destroyed before its children.
905 */
906 cgroup_put(cgrp->parent);
907 kernfs_put(cgrp->kn);
908 kfree(cgrp);
909 } else {
910 /*
911 * This is root cgroup's refcnt reaching zero, which
912 * indicates that the root should be released.
913 */
914 cgroup_destroy_root(cgrp->root);
915 }
865} 916}
866 917
867static void cgroup_free_rcu(struct rcu_head *head) 918static void cgroup_free_rcu(struct rcu_head *head)
@@ -872,73 +923,40 @@ static void cgroup_free_rcu(struct rcu_head *head)
872 queue_work(cgroup_destroy_wq, &cgrp->destroy_work); 923 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
873} 924}
874 925
875static void cgroup_diput(struct dentry *dentry, struct inode *inode) 926static void cgroup_get(struct cgroup *cgrp)
876{
877 /* is dentry a directory ? if so, kfree() associated cgroup */
878 if (S_ISDIR(inode->i_mode)) {
879 struct cgroup *cgrp = dentry->d_fsdata;
880
881 BUG_ON(!(cgroup_is_dead(cgrp)));
882
883 /*
884 * XXX: cgrp->id is only used to look up css's. As cgroup
885 * and css's lifetimes will be decoupled, it should be made
886 * per-subsystem and moved to css->id so that lookups are
887 * successful until the target css is released.
888 */
889 mutex_lock(&cgroup_mutex);
890 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
891 mutex_unlock(&cgroup_mutex);
892 cgrp->id = -1;
893
894 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
895 } else {
896 struct cfent *cfe = __d_cfe(dentry);
897 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
898
899 WARN_ONCE(!list_empty(&cfe->node) &&
900 cgrp != &cgrp->root->top_cgroup,
901 "cfe still linked for %s\n", cfe->type->name);
902 simple_xattrs_free(&cfe->xattrs);
903 kfree(cfe);
904 }
905 iput(inode);
906}
907
908static void remove_dir(struct dentry *d)
909{ 927{
910 struct dentry *parent = dget(d->d_parent); 928 WARN_ON_ONCE(cgroup_is_dead(cgrp));
911 929 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
912 d_delete(d); 930 atomic_inc(&cgrp->refcnt);
913 simple_rmdir(parent->d_inode, d);
914 dput(parent);
915} 931}
916 932
917static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 933static void cgroup_put(struct cgroup *cgrp)
918{ 934{
919 struct cfent *cfe; 935 if (!atomic_dec_and_test(&cgrp->refcnt))
920 936 return;
921 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 937 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
922 lockdep_assert_held(&cgroup_mutex); 938 return;
923 939
924 /* 940 /*
925 * If we're doing cleanup due to failure of cgroup_create(), 941 * XXX: cgrp->id is only used to look up css's. As cgroup and
926 * the corresponding @cfe may not exist. 942 * css's lifetimes will be decoupled, it should be made
943 * per-subsystem and moved to css->id so that lookups are
944 * successful until the target css is released.
927 */ 945 */
928 list_for_each_entry(cfe, &cgrp->files, node) { 946 mutex_lock(&cgroup_mutex);
929 struct dentry *d = cfe->dentry; 947 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
948 mutex_unlock(&cgroup_mutex);
949 cgrp->id = -1;
930 950
931 if (cft && cfe->type != cft) 951 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
932 continue; 952}
933 953
934 dget(d); 954static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
935 d_delete(d); 955{
936 simple_unlink(cgrp->dentry->d_inode, d); 956 char name[CGROUP_FILE_NAME_MAX];
937 list_del_init(&cfe->node);
938 dput(d);
939 957
940 break; 958 lockdep_assert_held(&cgroup_tree_mutex);
941 } 959 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
942} 960}
943 961
944/** 962/**
@@ -952,144 +970,106 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
952 int i; 970 int i;
953 971
954 for_each_subsys(ss, i) { 972 for_each_subsys(ss, i) {
955 struct cftype_set *set; 973 struct cftype *cfts;
956 974
957 if (!test_bit(i, &subsys_mask)) 975 if (!test_bit(i, &subsys_mask))
958 continue; 976 continue;
959 list_for_each_entry(set, &ss->cftsets, node) 977 list_for_each_entry(cfts, &ss->cfts, node)
960 cgroup_addrm_files(cgrp, set->cfts, false); 978 cgroup_addrm_files(cgrp, cfts, false);
961 } 979 }
962} 980}
963 981
964/* 982static int rebind_subsystems(struct cgroup_root *dst_root,
965 * NOTE : the dentry must have been dget()'ed 983 unsigned long ss_mask)
966 */
967static void cgroup_d_remove_dir(struct dentry *dentry)
968{
969 struct dentry *parent;
970
971 parent = dentry->d_parent;
972 spin_lock(&parent->d_lock);
973 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
974 list_del_init(&dentry->d_u.d_child);
975 spin_unlock(&dentry->d_lock);
976 spin_unlock(&parent->d_lock);
977 remove_dir(dentry);
978}
979
980/*
981 * Call with cgroup_mutex held. Drops reference counts on modules, including
982 * any duplicate ones that parse_cgroupfs_options took. If this function
983 * returns an error, no reference counts are touched.
984 */
985static int rebind_subsystems(struct cgroupfs_root *root,
986 unsigned long added_mask, unsigned removed_mask)
987{ 984{
988 struct cgroup *cgrp = &root->top_cgroup;
989 struct cgroup_subsys *ss; 985 struct cgroup_subsys *ss;
990 unsigned long pinned = 0; 986 int ssid, ret;
991 int i, ret;
992 987
993 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 988 lockdep_assert_held(&cgroup_tree_mutex);
994 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 989 lockdep_assert_held(&cgroup_mutex);
995 990
996 /* Check that any added subsystems are currently free */ 991 for_each_subsys(ss, ssid) {
997 for_each_subsys(ss, i) { 992 if (!(ss_mask & (1 << ssid)))
998 if (!(added_mask & (1 << i)))
999 continue; 993 continue;
1000 994
1001 /* is the subsystem mounted elsewhere? */ 995 /* if @ss is on the dummy_root, we can always move it */
1002 if (ss->root != &cgroup_dummy_root) { 996 if (ss->root == &cgrp_dfl_root)
1003 ret = -EBUSY; 997 continue;
1004 goto out_put;
1005 }
1006 998
1007 /* pin the module */ 999 /* if @ss has non-root cgroups attached to it, can't move */
1008 if (!try_module_get(ss->module)) { 1000 if (!list_empty(&ss->root->cgrp.children))
1009 ret = -ENOENT; 1001 return -EBUSY;
1010 goto out_put;
1011 }
1012 pinned |= 1 << i;
1013 }
1014 1002
1015 /* subsys could be missing if unloaded between parsing and here */ 1003 /* can't move between two non-dummy roots either */
1016 if (added_mask != pinned) { 1004 if (dst_root != &cgrp_dfl_root)
1017 ret = -ENOENT; 1005 return -EBUSY;
1018 goto out_put;
1019 } 1006 }
1020 1007
1021 ret = cgroup_populate_dir(cgrp, added_mask); 1008 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);
1022 if (ret) 1009 if (ret) {
1023 goto out_put; 1010 if (dst_root != &cgrp_dfl_root)
1011 return ret;
1012
1013 /*
1014 * Rebinding back to the default root is not allowed to
1015 * fail. Using both default and non-default roots should
1016 * be rare. Moving subsystems back and forth even more so.
1017 * Just warn about it and continue.
1018 */
1019 if (cgrp_dfl_root_visible) {
1020 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",
1021 ret, ss_mask);
1022 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");
1023 }
1024 }
1024 1025
1025 /* 1026 /*
1026 * Nothing can fail from this point on. Remove files for the 1027 * Nothing can fail from this point on. Remove files for the
1027 * removed subsystems and rebind each subsystem. 1028 * removed subsystems and rebind each subsystem.
1028 */ 1029 */
1029 cgroup_clear_dir(cgrp, removed_mask); 1030 mutex_unlock(&cgroup_mutex);
1030 1031 for_each_subsys(ss, ssid)
1031 for_each_subsys(ss, i) { 1032 if (ss_mask & (1 << ssid))
1032 unsigned long bit = 1UL << i; 1033 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1033 1034 mutex_lock(&cgroup_mutex);
1034 if (bit & added_mask) {
1035 /* We're binding this subsystem to this hierarchy */
1036 BUG_ON(cgroup_css(cgrp, ss));
1037 BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
1038 BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
1039 1035
1040 rcu_assign_pointer(cgrp->subsys[i], 1036 for_each_subsys(ss, ssid) {
1041 cgroup_css(cgroup_dummy_top, ss)); 1037 struct cgroup_root *src_root;
1042 cgroup_css(cgrp, ss)->cgroup = cgrp; 1038 struct cgroup_subsys_state *css;
1043 1039
1044 ss->root = root; 1040 if (!(ss_mask & (1 << ssid)))
1045 if (ss->bind) 1041 continue;
1046 ss->bind(cgroup_css(cgrp, ss));
1047 1042
1048 /* refcount was already taken, and we're keeping it */ 1043 src_root = ss->root;
1049 root->subsys_mask |= bit; 1044 css = cgroup_css(&src_root->cgrp, ss);
1050 } else if (bit & removed_mask) {
1051 /* We're removing this subsystem */
1052 BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
1053 BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
1054 1045
1055 if (ss->bind) 1046 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
1056 ss->bind(cgroup_css(cgroup_dummy_top, ss));
1057 1047
1058 cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; 1048 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
1059 RCU_INIT_POINTER(cgrp->subsys[i], NULL); 1049 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
1050 ss->root = dst_root;
1051 css->cgroup = &dst_root->cgrp;
1060 1052
1061 cgroup_subsys[i]->root = &cgroup_dummy_root; 1053 src_root->cgrp.subsys_mask &= ~(1 << ssid);
1054 dst_root->cgrp.subsys_mask |= 1 << ssid;
1062 1055
1063 /* subsystem is now free - drop reference on module */ 1056 if (ss->bind)
1064 module_put(ss->module); 1057 ss->bind(css);
1065 root->subsys_mask &= ~bit;
1066 }
1067 } 1058 }
1068 1059
1069 /* 1060 kernfs_activate(dst_root->cgrp.kn);
1070 * Mark @root has finished binding subsystems. @root->subsys_mask
1071 * now matches the bound subsystems.
1072 */
1073 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1074
1075 return 0; 1061 return 0;
1076
1077out_put:
1078 for_each_subsys(ss, i)
1079 if (pinned & (1 << i))
1080 module_put(ss->module);
1081 return ret;
1082} 1062}
1083 1063
1084static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) 1064static int cgroup_show_options(struct seq_file *seq,
1065 struct kernfs_root *kf_root)
1085{ 1066{
1086 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1067 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1087 struct cgroup_subsys *ss; 1068 struct cgroup_subsys *ss;
1088 int ssid; 1069 int ssid;
1089 1070
1090 mutex_lock(&cgroup_root_mutex);
1091 for_each_subsys(ss, ssid) 1071 for_each_subsys(ss, ssid)
1092 if (root->subsys_mask & (1 << ssid)) 1072 if (root->cgrp.subsys_mask & (1 << ssid))
1093 seq_printf(seq, ",%s", ss->name); 1073 seq_printf(seq, ",%s", ss->name);
1094 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1074 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1095 seq_puts(seq, ",sane_behavior"); 1075 seq_puts(seq, ",sane_behavior");
@@ -1097,13 +1077,16 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1097 seq_puts(seq, ",noprefix"); 1077 seq_puts(seq, ",noprefix");
1098 if (root->flags & CGRP_ROOT_XATTR) 1078 if (root->flags & CGRP_ROOT_XATTR)
1099 seq_puts(seq, ",xattr"); 1079 seq_puts(seq, ",xattr");
1080
1081 spin_lock(&release_agent_path_lock);
1100 if (strlen(root->release_agent_path)) 1082 if (strlen(root->release_agent_path))
1101 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1083 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1102 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) 1084 spin_unlock(&release_agent_path_lock);
1085
1086 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1103 seq_puts(seq, ",clone_children"); 1087 seq_puts(seq, ",clone_children");
1104 if (strlen(root->name)) 1088 if (strlen(root->name))
1105 seq_printf(seq, ",name=%s", root->name); 1089 seq_printf(seq, ",name=%s", root->name);
1106 mutex_unlock(&cgroup_root_mutex);
1107 return 0; 1090 return 0;
1108} 1091}
1109 1092
@@ -1115,9 +1098,6 @@ struct cgroup_sb_opts {
1115 char *name; 1098 char *name;
1116 /* User explicitly requested empty subsystem */ 1099 /* User explicitly requested empty subsystem */
1117 bool none; 1100 bool none;
1118
1119 struct cgroupfs_root *new_root;
1120
1121}; 1101};
1122 1102
1123/* 1103/*
@@ -1137,7 +1117,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1137 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1117 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1138 1118
1139#ifdef CONFIG_CPUSETS 1119#ifdef CONFIG_CPUSETS
1140 mask = ~(1UL << cpuset_subsys_id); 1120 mask = ~(1UL << cpuset_cgrp_id);
1141#endif 1121#endif
1142 1122
1143 memset(opts, 0, sizeof(*opts)); 1123 memset(opts, 0, sizeof(*opts));
@@ -1227,30 +1207,34 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1227 return -ENOENT; 1207 return -ENOENT;
1228 } 1208 }
1229 1209
1230 /*
1231 * If the 'all' option was specified select all the subsystems,
1232 * otherwise if 'none', 'name=' and a subsystem name options
1233 * were not specified, let's default to 'all'
1234 */
1235 if (all_ss || (!one_ss && !opts->none && !opts->name))
1236 for_each_subsys(ss, i)
1237 if (!ss->disabled)
1238 set_bit(i, &opts->subsys_mask);
1239
1240 /* Consistency checks */ 1210 /* Consistency checks */
1241 1211
1242 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1212 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1243 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1213 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1244 1214
1245 if (opts->flags & CGRP_ROOT_NOPREFIX) { 1215 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1246 pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); 1216 opts->cpuset_clone_children || opts->release_agent ||
1217 opts->name) {
1218 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1247 return -EINVAL; 1219 return -EINVAL;
1248 } 1220 }
1221 } else {
1222 /*
1223 * If the 'all' option was specified select all the
1224 * subsystems, otherwise if 'none', 'name=' and a subsystem
1225 * name options were not specified, let's default to 'all'
1226 */
1227 if (all_ss || (!one_ss && !opts->none && !opts->name))
1228 for_each_subsys(ss, i)
1229 if (!ss->disabled)
1230 set_bit(i, &opts->subsys_mask);
1249 1231
1250 if (opts->cpuset_clone_children) { 1232 /*
1251 pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); 1233 * We either have to specify by name or by subsystems. (So
1234 * all empty hierarchies must have a name).
1235 */
1236 if (!opts->subsys_mask && !opts->name)
1252 return -EINVAL; 1237 return -EINVAL;
1253 }
1254 } 1238 }
1255 1239
1256 /* 1240 /*
@@ -1266,21 +1250,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1266 if (opts->subsys_mask && opts->none) 1250 if (opts->subsys_mask && opts->none)
1267 return -EINVAL; 1251 return -EINVAL;
1268 1252
1269 /*
1270 * We either have to specify by name or by subsystems. (So all
1271 * empty hierarchies must have a name).
1272 */
1273 if (!opts->subsys_mask && !opts->name)
1274 return -EINVAL;
1275
1276 return 0; 1253 return 0;
1277} 1254}
1278 1255
1279static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1256static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1280{ 1257{
1281 int ret = 0; 1258 int ret = 0;
1282 struct cgroupfs_root *root = sb->s_fs_info; 1259 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1283 struct cgroup *cgrp = &root->top_cgroup;
1284 struct cgroup_sb_opts opts; 1260 struct cgroup_sb_opts opts;
1285 unsigned long added_mask, removed_mask; 1261 unsigned long added_mask, removed_mask;
1286 1262
@@ -1289,21 +1265,20 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1289 return -EINVAL; 1265 return -EINVAL;
1290 } 1266 }
1291 1267
1292 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1268 mutex_lock(&cgroup_tree_mutex);
1293 mutex_lock(&cgroup_mutex); 1269 mutex_lock(&cgroup_mutex);
1294 mutex_lock(&cgroup_root_mutex);
1295 1270
1296 /* See what subsystems are wanted */ 1271 /* See what subsystems are wanted */
1297 ret = parse_cgroupfs_options(data, &opts); 1272 ret = parse_cgroupfs_options(data, &opts);
1298 if (ret) 1273 if (ret)
1299 goto out_unlock; 1274 goto out_unlock;
1300 1275
1301 if (opts.subsys_mask != root->subsys_mask || opts.release_agent) 1276 if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent)
1302 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1277 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1303 task_tgid_nr(current), current->comm); 1278 task_tgid_nr(current), current->comm);
1304 1279
1305 added_mask = opts.subsys_mask & ~root->subsys_mask; 1280 added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask;
1306 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1281 removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask;
1307 1282
1308 /* Don't allow flags or name to change at remount */ 1283 /* Don't allow flags or name to change at remount */
1309 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1284 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
@@ -1316,422 +1291,331 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1316 } 1291 }
1317 1292
1318 /* remounting is not allowed for populated hierarchies */ 1293 /* remounting is not allowed for populated hierarchies */
1319 if (root->number_of_cgroups > 1) { 1294 if (!list_empty(&root->cgrp.children)) {
1320 ret = -EBUSY; 1295 ret = -EBUSY;
1321 goto out_unlock; 1296 goto out_unlock;
1322 } 1297 }
1323 1298
1324 ret = rebind_subsystems(root, added_mask, removed_mask); 1299 ret = rebind_subsystems(root, added_mask);
1325 if (ret) 1300 if (ret)
1326 goto out_unlock; 1301 goto out_unlock;
1327 1302
1328 if (opts.release_agent) 1303 rebind_subsystems(&cgrp_dfl_root, removed_mask);
1304
1305 if (opts.release_agent) {
1306 spin_lock(&release_agent_path_lock);
1329 strcpy(root->release_agent_path, opts.release_agent); 1307 strcpy(root->release_agent_path, opts.release_agent);
1308 spin_unlock(&release_agent_path_lock);
1309 }
1330 out_unlock: 1310 out_unlock:
1331 kfree(opts.release_agent); 1311 kfree(opts.release_agent);
1332 kfree(opts.name); 1312 kfree(opts.name);
1333 mutex_unlock(&cgroup_root_mutex);
1334 mutex_unlock(&cgroup_mutex); 1313 mutex_unlock(&cgroup_mutex);
1335 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1314 mutex_unlock(&cgroup_tree_mutex);
1336 return ret; 1315 return ret;
1337} 1316}
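For reference, cgroup_remount() above is reached from a userspace remount such as the hedged sketch below; the mount point and the release_agent path are illustrative assumptions only.

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* changing options this way now triggers the deprecation
		 * warning above and is rejected for populated hierarchies */
		if (mount(NULL, "/sys/fs/cgroup/cpu", NULL, MS_REMOUNT,
			  "cpu,release_agent=/sbin/cgroup-release"))
			perror("remount");
		return 0;
	}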
1338 1317
1339static const struct super_operations cgroup_ops = { 1318/*
1340 .statfs = simple_statfs, 1319 * To reduce the fork() overhead for systems that are not actually using
1341 .drop_inode = generic_delete_inode, 1320 * their cgroups capability, we don't maintain the lists running through
1342 .show_options = cgroup_show_options, 1321 * each css_set to its tasks until we see the list actually used - in other
1343 .remount_fs = cgroup_remount, 1322 * words after the first mount.
1344}; 1323 */
1324static bool use_task_css_set_links __read_mostly;
1325
1326static void cgroup_enable_task_cg_lists(void)
1327{
1328 struct task_struct *p, *g;
1329
1330 down_write(&css_set_rwsem);
1331
1332 if (use_task_css_set_links)
1333 goto out_unlock;
1334
1335 use_task_css_set_links = true;
1336
1337 /*
1338 * We need tasklist_lock because RCU is not safe against
1339 * while_each_thread(). Besides, a forking task that has passed
1340 * cgroup_post_fork() without seeing use_task_css_set_links = 1
1341 * is not guaranteed to have its child immediately visible in the
1342 * tasklist if we walk through it with RCU.
1343 */
1344 read_lock(&tasklist_lock);
1345 do_each_thread(g, p) {
1346 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1347 task_css_set(p) != &init_css_set);
1348
1349 /*
1350 * We should check if the process is exiting, otherwise
1351 * it will race with cgroup_exit() in that the list
 1352 * entry won't be deleted even though the process has exited.
1353 * Do it while holding siglock so that we don't end up
1354 * racing against cgroup_exit().
1355 */
1356 spin_lock_irq(&p->sighand->siglock);
1357 if (!(p->flags & PF_EXITING)) {
1358 struct css_set *cset = task_css_set(p);
1359
1360 list_add(&p->cg_list, &cset->tasks);
1361 get_css_set(cset);
1362 }
1363 spin_unlock_irq(&p->sighand->siglock);
1364 } while_each_thread(g, p);
1365 read_unlock(&tasklist_lock);
1366out_unlock:
1367 up_write(&css_set_rwsem);
1368}
1345 1369
1346static void init_cgroup_housekeeping(struct cgroup *cgrp) 1370static void init_cgroup_housekeeping(struct cgroup *cgrp)
1347{ 1371{
1372 atomic_set(&cgrp->refcnt, 1);
1348 INIT_LIST_HEAD(&cgrp->sibling); 1373 INIT_LIST_HEAD(&cgrp->sibling);
1349 INIT_LIST_HEAD(&cgrp->children); 1374 INIT_LIST_HEAD(&cgrp->children);
1350 INIT_LIST_HEAD(&cgrp->files);
1351 INIT_LIST_HEAD(&cgrp->cset_links); 1375 INIT_LIST_HEAD(&cgrp->cset_links);
1352 INIT_LIST_HEAD(&cgrp->release_list); 1376 INIT_LIST_HEAD(&cgrp->release_list);
1353 INIT_LIST_HEAD(&cgrp->pidlists); 1377 INIT_LIST_HEAD(&cgrp->pidlists);
1354 mutex_init(&cgrp->pidlist_mutex); 1378 mutex_init(&cgrp->pidlist_mutex);
1355 cgrp->dummy_css.cgroup = cgrp; 1379 cgrp->dummy_css.cgroup = cgrp;
1356 simple_xattrs_init(&cgrp->xattrs);
1357} 1380}
1358 1381
1359static void init_cgroup_root(struct cgroupfs_root *root) 1382static void init_cgroup_root(struct cgroup_root *root,
1383 struct cgroup_sb_opts *opts)
1360{ 1384{
1361 struct cgroup *cgrp = &root->top_cgroup; 1385 struct cgroup *cgrp = &root->cgrp;
1362 1386
1363 INIT_LIST_HEAD(&root->root_list); 1387 INIT_LIST_HEAD(&root->root_list);
1364 root->number_of_cgroups = 1; 1388 atomic_set(&root->nr_cgrps, 1);
1365 cgrp->root = root; 1389 cgrp->root = root;
1366 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1367 init_cgroup_housekeeping(cgrp); 1390 init_cgroup_housekeeping(cgrp);
1368 idr_init(&root->cgroup_idr); 1391 idr_init(&root->cgroup_idr);
1369}
1370
1371static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
1372{
1373 int id;
1374
1375 lockdep_assert_held(&cgroup_mutex);
1376 lockdep_assert_held(&cgroup_root_mutex);
1377
1378 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
1379 GFP_KERNEL);
1380 if (id < 0)
1381 return id;
1382
1383 root->hierarchy_id = id;
1384 return 0;
1385}
1386
1387static void cgroup_exit_root_id(struct cgroupfs_root *root)
1388{
1389 lockdep_assert_held(&cgroup_mutex);
1390 lockdep_assert_held(&cgroup_root_mutex);
1391
1392 if (root->hierarchy_id) {
1393 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1394 root->hierarchy_id = 0;
1395 }
1396}
1397
1398static int cgroup_test_super(struct super_block *sb, void *data)
1399{
1400 struct cgroup_sb_opts *opts = data;
1401 struct cgroupfs_root *root = sb->s_fs_info;
1402
1403 /* If we asked for a name then it must match */
1404 if (opts->name && strcmp(opts->name, root->name))
1405 return 0;
1406
1407 /*
1408 * If we asked for subsystems (or explicitly for no
1409 * subsystems) then they must match
1410 */
1411 if ((opts->subsys_mask || opts->none)
1412 && (opts->subsys_mask != root->subsys_mask))
1413 return 0;
1414
1415 return 1;
1416}
1417
1418static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1419{
1420 struct cgroupfs_root *root;
1421 1392
1422 if (!opts->subsys_mask && !opts->none)
1423 return NULL;
1424
1425 root = kzalloc(sizeof(*root), GFP_KERNEL);
1426 if (!root)
1427 return ERR_PTR(-ENOMEM);
1428
1429 init_cgroup_root(root);
1430
1431 /*
1432 * We need to set @root->subsys_mask now so that @root can be
1433 * matched by cgroup_test_super() before it finishes
1434 * initialization; otherwise, competing mounts with the same
1435 * options may try to bind the same subsystems instead of waiting
1436 * for the first one leading to unexpected mount errors.
1437 * SUBSYS_BOUND will be set once actual binding is complete.
1438 */
1439 root->subsys_mask = opts->subsys_mask;
1440 root->flags = opts->flags; 1393 root->flags = opts->flags;
1441 if (opts->release_agent) 1394 if (opts->release_agent)
1442 strcpy(root->release_agent_path, opts->release_agent); 1395 strcpy(root->release_agent_path, opts->release_agent);
1443 if (opts->name) 1396 if (opts->name)
1444 strcpy(root->name, opts->name); 1397 strcpy(root->name, opts->name);
1445 if (opts->cpuset_clone_children) 1398 if (opts->cpuset_clone_children)
1446 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); 1399 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1447 return root;
1448} 1400}
1449 1401
1450static void cgroup_free_root(struct cgroupfs_root *root) 1402static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1451{ 1403{
1452 if (root) { 1404 LIST_HEAD(tmp_links);
 1453 /* hierarchy ID should already have been released */ 1405 struct cgroup *root_cgrp = &root->cgrp;
1454 WARN_ON_ONCE(root->hierarchy_id); 1406 struct css_set *cset;
1455 1407 int i, ret;
1456 idr_destroy(&root->cgroup_idr);
1457 kfree(root);
1458 }
1459}
1460 1408
1461static int cgroup_set_super(struct super_block *sb, void *data) 1409 lockdep_assert_held(&cgroup_tree_mutex);
1462{ 1410 lockdep_assert_held(&cgroup_mutex);
1463 int ret;
1464 struct cgroup_sb_opts *opts = data;
1465 1411
1466 /* If we don't have a new root, we can't set up a new sb */ 1412 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1467 if (!opts->new_root) 1413 if (ret < 0)
1468 return -EINVAL; 1414 goto out;
1415 root_cgrp->id = ret;
1469 1416
1470 BUG_ON(!opts->subsys_mask && !opts->none); 1417 /*
1418 * We're accessing css_set_count without locking css_set_rwsem here,
1419 * but that's OK - it can only be increased by someone holding
1420 * cgroup_lock, and that's us. The worst that can happen is that we
1421 * have some link structures left over
1422 */
1423 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1424 if (ret)
1425 goto out;
1471 1426
1472 ret = set_anon_super(sb, NULL); 1427 ret = cgroup_init_root_id(root);
1473 if (ret) 1428 if (ret)
1474 return ret; 1429 goto out;
1475 1430
1476 sb->s_fs_info = opts->new_root; 1431 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1477 opts->new_root->sb = sb; 1432 KERNFS_ROOT_CREATE_DEACTIVATED,
1433 root_cgrp);
1434 if (IS_ERR(root->kf_root)) {
1435 ret = PTR_ERR(root->kf_root);
1436 goto exit_root_id;
1437 }
1438 root_cgrp->kn = root->kf_root->kn;
1478 1439
1479 sb->s_blocksize = PAGE_CACHE_SIZE; 1440 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1480 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1441 if (ret)
1481 sb->s_magic = CGROUP_SUPER_MAGIC; 1442 goto destroy_root;
1482 sb->s_op = &cgroup_ops;
1483 1443
1484 return 0; 1444 ret = rebind_subsystems(root, ss_mask);
1485} 1445 if (ret)
1446 goto destroy_root;
1486 1447
1487static int cgroup_get_rootdir(struct super_block *sb) 1448 /*
1488{ 1449 * There must be no failure case after here, since rebinding takes
1489 static const struct dentry_operations cgroup_dops = { 1450 * care of subsystems' refcounts, which are explicitly dropped in
1490 .d_iput = cgroup_diput, 1451 * the failure exit path.
1491 .d_delete = always_delete_dentry, 1452 */
1492 }; 1453 list_add(&root->root_list, &cgroup_roots);
1454 cgroup_root_count++;
1493 1455
1494 struct inode *inode = 1456 /*
1495 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1457 * Link the root cgroup in this hierarchy into all the css_set
1458 * objects.
1459 */
1460 down_write(&css_set_rwsem);
1461 hash_for_each(css_set_table, i, cset, hlist)
1462 link_css_set(&tmp_links, cset, root_cgrp);
1463 up_write(&css_set_rwsem);
1496 1464
1497 if (!inode) 1465 BUG_ON(!list_empty(&root_cgrp->children));
1498 return -ENOMEM; 1466 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1499 1467
1500 inode->i_fop = &simple_dir_operations; 1468 kernfs_activate(root_cgrp->kn);
1501 inode->i_op = &cgroup_dir_inode_operations; 1469 ret = 0;
1502 /* directories start off with i_nlink == 2 (for "." entry) */ 1470 goto out;
1503 inc_nlink(inode); 1471
1504 sb->s_root = d_make_root(inode); 1472destroy_root:
1505 if (!sb->s_root) 1473 kernfs_destroy_root(root->kf_root);
1506 return -ENOMEM; 1474 root->kf_root = NULL;
1507 /* for everything else we want ->d_op set */ 1475exit_root_id:
1508 sb->s_d_op = &cgroup_dops; 1476 cgroup_exit_root_id(root);
1509 return 0; 1477out:
1478 free_cgrp_cset_links(&tmp_links);
1479 return ret;
1510} 1480}
1511 1481
1512static struct dentry *cgroup_mount(struct file_system_type *fs_type, 1482static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1513 int flags, const char *unused_dev_name, 1483 int flags, const char *unused_dev_name,
1514 void *data) 1484 void *data)
1515{ 1485{
1486 struct cgroup_root *root;
1516 struct cgroup_sb_opts opts; 1487 struct cgroup_sb_opts opts;
1517 struct cgroupfs_root *root; 1488 struct dentry *dentry;
1518 int ret = 0; 1489 int ret;
1519 struct super_block *sb;
1520 struct cgroupfs_root *new_root;
1521 struct list_head tmp_links;
1522 struct inode *inode;
1523 const struct cred *cred;
1524 1490
1525 /* First find the desired set of subsystems */ 1491 /*
1492 * The first time anyone tries to mount a cgroup, enable the list
1493 * linking each css_set to its tasks and fix up all existing tasks.
1494 */
1495 if (!use_task_css_set_links)
1496 cgroup_enable_task_cg_lists();
1497retry:
1498 mutex_lock(&cgroup_tree_mutex);
1526 mutex_lock(&cgroup_mutex); 1499 mutex_lock(&cgroup_mutex);
1500
1501 /* First find the desired set of subsystems */
1527 ret = parse_cgroupfs_options(data, &opts); 1502 ret = parse_cgroupfs_options(data, &opts);
1528 mutex_unlock(&cgroup_mutex);
1529 if (ret) 1503 if (ret)
1530 goto out_err; 1504 goto out_unlock;
1531
1532 /*
1533 * Allocate a new cgroup root. We may not need it if we're
1534 * reusing an existing hierarchy.
1535 */
1536 new_root = cgroup_root_from_opts(&opts);
1537 if (IS_ERR(new_root)) {
1538 ret = PTR_ERR(new_root);
1539 goto out_err;
1540 }
1541 opts.new_root = new_root;
1542 1505
1543 /* Locate an existing or new sb for this hierarchy */ 1506 /* look for a matching existing root */
1544 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); 1507 if (!opts.subsys_mask && !opts.none && !opts.name) {
1545 if (IS_ERR(sb)) { 1508 cgrp_dfl_root_visible = true;
1546 ret = PTR_ERR(sb); 1509 root = &cgrp_dfl_root;
1547 cgroup_free_root(opts.new_root); 1510 cgroup_get(&root->cgrp);
1548 goto out_err; 1511 ret = 0;
1512 goto out_unlock;
1549 } 1513 }
1550 1514
1551 root = sb->s_fs_info; 1515 for_each_root(root) {
1552 BUG_ON(!root); 1516 bool name_match = false;
1553 if (root == opts.new_root) {
1554 /* We used the new root structure, so this is a new hierarchy */
1555 struct cgroup *root_cgrp = &root->top_cgroup;
1556 struct cgroupfs_root *existing_root;
1557 int i;
1558 struct css_set *cset;
1559
1560 BUG_ON(sb->s_root != NULL);
1561
1562 ret = cgroup_get_rootdir(sb);
1563 if (ret)
1564 goto drop_new_super;
1565 inode = sb->s_root->d_inode;
1566
1567 mutex_lock(&inode->i_mutex);
1568 mutex_lock(&cgroup_mutex);
1569 mutex_lock(&cgroup_root_mutex);
1570
1571 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1572 if (ret < 0)
1573 goto unlock_drop;
1574 root_cgrp->id = ret;
1575
1576 /* Check for name clashes with existing mounts */
1577 ret = -EBUSY;
1578 if (strlen(root->name))
1579 for_each_active_root(existing_root)
1580 if (!strcmp(existing_root->name, root->name))
1581 goto unlock_drop;
1582
1583 /*
1584 * We're accessing css_set_count without locking
1585 * css_set_lock here, but that's OK - it can only be
1586 * increased by someone holding cgroup_lock, and
1587 * that's us. The worst that can happen is that we
1588 * have some link structures left over
1589 */
1590 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1591 if (ret)
1592 goto unlock_drop;
1593 1517
1594 /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ 1518 if (root == &cgrp_dfl_root)
1595 ret = cgroup_init_root_id(root, 2, 0); 1519 continue;
1596 if (ret)
1597 goto unlock_drop;
1598
1599 sb->s_root->d_fsdata = root_cgrp;
1600 root_cgrp->dentry = sb->s_root;
1601
1602 /*
1603 * We're inside get_sb() and will call lookup_one_len() to
1604 * create the root files, which doesn't work if SELinux is
1605 * in use. The following cred dancing somehow works around
1606 * it. See 2ce9738ba ("cgroupfs: use init_cred when
1607 * populating new cgroupfs mount") for more details.
1608 */
1609 cred = override_creds(&init_cred);
1610
1611 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1612 if (ret)
1613 goto rm_base_files;
1614
1615 ret = rebind_subsystems(root, root->subsys_mask, 0);
1616 if (ret)
1617 goto rm_base_files;
1618
1619 revert_creds(cred);
1620 1520
1621 /* 1521 /*
1622 * There must be no failure case after here, since rebinding 1522 * If we asked for a name then it must match. Also, if
 1623 * takes care of subsystems' refcounts, which are explicitly 1523 * name matches but subsys_mask doesn't, we should fail.
1624 * dropped in the failure exit path. 1524 * Remember whether name matched.
1625 */ 1525 */
1526 if (opts.name) {
1527 if (strcmp(opts.name, root->name))
1528 continue;
1529 name_match = true;
1530 }
1626 1531
1627 list_add(&root->root_list, &cgroup_roots);
1628 cgroup_root_count++;
1629
1630 /* Link the top cgroup in this hierarchy into all
1631 * the css_set objects */
1632 write_lock(&css_set_lock);
1633 hash_for_each(css_set_table, i, cset, hlist)
1634 link_css_set(&tmp_links, cset, root_cgrp);
1635 write_unlock(&css_set_lock);
1636
1637 free_cgrp_cset_links(&tmp_links);
1638
1639 BUG_ON(!list_empty(&root_cgrp->children));
1640 BUG_ON(root->number_of_cgroups != 1);
1641
1642 mutex_unlock(&cgroup_root_mutex);
1643 mutex_unlock(&cgroup_mutex);
1644 mutex_unlock(&inode->i_mutex);
1645 } else {
1646 /* 1532 /*
1647 * We re-used an existing hierarchy - the new root (if 1533 * If we asked for subsystems (or explicitly for no
1648 * any) is not needed 1534 * subsystems) then they must match.
1649 */ 1535 */
1650 cgroup_free_root(opts.new_root); 1536 if ((opts.subsys_mask || opts.none) &&
1537 (opts.subsys_mask != root->cgrp.subsys_mask)) {
1538 if (!name_match)
1539 continue;
1540 ret = -EBUSY;
1541 goto out_unlock;
1542 }
1651 1543
1652 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1544 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1653 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1545 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1654 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1546 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1655 ret = -EINVAL; 1547 ret = -EINVAL;
1656 goto drop_new_super; 1548 goto out_unlock;
1657 } else { 1549 } else {
1658 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1550 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1659 } 1551 }
1660 } 1552 }
1661 }
1662
1663 kfree(opts.release_agent);
1664 kfree(opts.name);
1665 return dget(sb->s_root);
1666
1667 rm_base_files:
1668 free_cgrp_cset_links(&tmp_links);
1669 cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
1670 revert_creds(cred);
1671 unlock_drop:
1672 cgroup_exit_root_id(root);
1673 mutex_unlock(&cgroup_root_mutex);
1674 mutex_unlock(&cgroup_mutex);
1675 mutex_unlock(&inode->i_mutex);
1676 drop_new_super:
1677 deactivate_locked_super(sb);
1678 out_err:
1679 kfree(opts.release_agent);
1680 kfree(opts.name);
1681 return ERR_PTR(ret);
1682}
1683
1684static void cgroup_kill_sb(struct super_block *sb)
1685{
1686 struct cgroupfs_root *root = sb->s_fs_info;
1687 struct cgroup *cgrp = &root->top_cgroup;
1688 struct cgrp_cset_link *link, *tmp_link;
1689 int ret;
1690
1691 BUG_ON(!root);
1692
1693 BUG_ON(root->number_of_cgroups != 1);
1694 BUG_ON(!list_empty(&cgrp->children));
1695 1553
1696 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1554 /*
1697 mutex_lock(&cgroup_mutex); 1555 * A root's lifetime is governed by its root cgroup. Zero
 1698 mutex_lock(&cgroup_root_mutex); 1556 * ref indicates that the root is being destroyed. Wait for
1557 * destruction to complete so that the subsystems are free.
 1558 * We could use a wait queue here, but this path is
1559 * super cold. Let's just sleep for a bit and retry.
1560 */
1561 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
1562 mutex_unlock(&cgroup_mutex);
1563 mutex_unlock(&cgroup_tree_mutex);
1564 kfree(opts.release_agent);
1565 kfree(opts.name);
1566 msleep(10);
1567 goto retry;
1568 }
1699 1569
1700 /* Rebind all subsystems back to the default hierarchy */ 1570 ret = 0;
1701 if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { 1571 goto out_unlock;
1702 ret = rebind_subsystems(root, 0, root->subsys_mask);
1703 /* Shouldn't be able to fail ... */
1704 BUG_ON(ret);
1705 } 1572 }
1706 1573
1707 /* 1574 /*
1708 * Release all the links from cset_links to this hierarchy's 1575 * No such thing, create a new one. name= matching without subsys
1709 * root cgroup 1576 * specification is allowed for already existing hierarchies but we
1577 * can't create new one without subsys specification.
1710 */ 1578 */
1711 write_lock(&css_set_lock); 1579 if (!opts.subsys_mask && !opts.none) {
1712 1580 ret = -EINVAL;
1713 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { 1581 goto out_unlock;
1714 list_del(&link->cset_link);
1715 list_del(&link->cgrp_link);
1716 kfree(link);
1717 } 1582 }
1718 write_unlock(&css_set_lock);
1719 1583
1720 if (!list_empty(&root->root_list)) { 1584 root = kzalloc(sizeof(*root), GFP_KERNEL);
1721 list_del(&root->root_list); 1585 if (!root) {
1722 cgroup_root_count--; 1586 ret = -ENOMEM;
1587 goto out_unlock;
1723 } 1588 }
1724 1589
1725 cgroup_exit_root_id(root); 1590 init_cgroup_root(root, &opts);
1726 1591
1727 mutex_unlock(&cgroup_root_mutex); 1592 ret = cgroup_setup_root(root, opts.subsys_mask);
1593 if (ret)
1594 cgroup_free_root(root);
1595
1596out_unlock:
1728 mutex_unlock(&cgroup_mutex); 1597 mutex_unlock(&cgroup_mutex);
1729 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1598 mutex_unlock(&cgroup_tree_mutex);
1730 1599
1731 simple_xattrs_free(&cgrp->xattrs); 1600 kfree(opts.release_agent);
1601 kfree(opts.name);
1732 1602
1733 kill_litter_super(sb); 1603 if (ret)
1734 cgroup_free_root(root); 1604 return ERR_PTR(ret);
1605
1606 dentry = kernfs_mount(fs_type, flags, root->kf_root, NULL);
1607 if (IS_ERR(dentry))
1608 cgroup_put(&root->cgrp);
1609 return dentry;
1610}
1611
1612static void cgroup_kill_sb(struct super_block *sb)
1613{
1614 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1615 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1616
1617 cgroup_put(&root->cgrp);
1618 kernfs_kill_sb(sb);
1735} 1619}
1736 1620
1737static struct file_system_type cgroup_fs_type = { 1621static struct file_system_type cgroup_fs_type = {
@@ -1743,57 +1627,6 @@ static struct file_system_type cgroup_fs_type = {
1743static struct kobject *cgroup_kobj; 1627static struct kobject *cgroup_kobj;
1744 1628
1745/** 1629/**
1746 * cgroup_path - generate the path of a cgroup
1747 * @cgrp: the cgroup in question
1748 * @buf: the buffer to write the path into
1749 * @buflen: the length of the buffer
1750 *
1751 * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
1752 *
1753 * We can't generate cgroup path using dentry->d_name, as accessing
1754 * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
1755 * inode's i_mutex, while on the other hand cgroup_path() can be called
1756 * with some irq-safe spinlocks held.
1757 */
1758int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1759{
1760 int ret = -ENAMETOOLONG;
1761 char *start;
1762
1763 if (!cgrp->parent) {
1764 if (strlcpy(buf, "/", buflen) >= buflen)
1765 return -ENAMETOOLONG;
1766 return 0;
1767 }
1768
1769 start = buf + buflen - 1;
1770 *start = '\0';
1771
1772 rcu_read_lock();
1773 do {
1774 const char *name = cgroup_name(cgrp);
1775 int len;
1776
1777 len = strlen(name);
1778 if ((start -= len) < buf)
1779 goto out;
1780 memcpy(start, name, len);
1781
1782 if (--start < buf)
1783 goto out;
1784 *start = '/';
1785
1786 cgrp = cgrp->parent;
1787 } while (cgrp->parent);
1788 ret = 0;
1789 memmove(buf, start, buf + buflen - start);
1790out:
1791 rcu_read_unlock();
1792 return ret;
1793}
1794EXPORT_SYMBOL_GPL(cgroup_path);
1795
1796/**
1797 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy 1630 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1798 * @task: target task 1631 * @task: target task
1799 * @buf: the buffer to write the path into 1632 * @buf: the buffer to write the path into
@@ -1804,49 +1637,55 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1804 * function grabs cgroup_mutex and shouldn't be used inside locks used by 1637 * function grabs cgroup_mutex and shouldn't be used inside locks used by
1805 * cgroup controller callbacks. 1638 * cgroup controller callbacks.
1806 * 1639 *
1807 * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. 1640 * Return value is the same as kernfs_path().
1808 */ 1641 */
1809int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) 1642char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1810{ 1643{
1811 struct cgroupfs_root *root; 1644 struct cgroup_root *root;
1812 struct cgroup *cgrp; 1645 struct cgroup *cgrp;
1813 int hierarchy_id = 1, ret = 0; 1646 int hierarchy_id = 1;
1814 1647 char *path = NULL;
1815 if (buflen < 2)
1816 return -ENAMETOOLONG;
1817 1648
1818 mutex_lock(&cgroup_mutex); 1649 mutex_lock(&cgroup_mutex);
1650 down_read(&css_set_rwsem);
1819 1651
1820 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); 1652 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1821 1653
1822 if (root) { 1654 if (root) {
1823 cgrp = task_cgroup_from_root(task, root); 1655 cgrp = task_cgroup_from_root(task, root);
1824 ret = cgroup_path(cgrp, buf, buflen); 1656 path = cgroup_path(cgrp, buf, buflen);
1825 } else { 1657 } else {
1826 /* if no hierarchy exists, everyone is in "/" */ 1658 /* if no hierarchy exists, everyone is in "/" */
1827 memcpy(buf, "/", 2); 1659 if (strlcpy(buf, "/", buflen) < buflen)
1660 path = buf;
1828 } 1661 }
1829 1662
1663 up_read(&css_set_rwsem);
1830 mutex_unlock(&cgroup_mutex); 1664 mutex_unlock(&cgroup_mutex);
1831 return ret; 1665 return path;
1832} 1666}
1833EXPORT_SYMBOL_GPL(task_cgroup_path); 1667EXPORT_SYMBOL_GPL(task_cgroup_path);
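Since task_cgroup_path() now returns a pointer into the caller's buffer (kernfs_path() semantics) rather than 0/-errno, callers test the returned pointer. A hedged caller sketch, with the reporting helper and its message invented for illustration:

	#include <linux/cgroup.h>
	#include <linux/limits.h>
	#include <linux/printk.h>
	#include <linux/sched.h>
	#include <linux/slab.h>

	/* illustrative only: log the first-hierarchy cgroup of @tsk */
	static void report_task_cgroup(struct task_struct *tsk)
	{
		char *buf = kmalloc(PATH_MAX, GFP_KERNEL);
		char *path;

		if (!buf)
			return;

		path = task_cgroup_path(tsk, buf, PATH_MAX);
		if (path)
			pr_info("%s (pid %d) is in cgroup %s\n",
				tsk->comm, task_pid_nr(tsk), path);
		kfree(buf);
	}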
1834 1668
1835/* 1669/* used to track tasks and other necessary states during migration */
1836 * Control Group taskset
1837 */
1838struct task_and_cgroup {
1839 struct task_struct *task;
1840 struct cgroup *cgrp;
1841 struct css_set *cset;
1842};
1843
1844struct cgroup_taskset { 1670struct cgroup_taskset {
1845 struct task_and_cgroup single; 1671 /* the src and dst cset list running through cset->mg_node */
1846 struct flex_array *tc_array; 1672 struct list_head src_csets;
1847 int tc_array_len; 1673 struct list_head dst_csets;
1848 int idx; 1674
1849 struct cgroup *cur_cgrp; 1675 /*
1676 * Fields for cgroup_taskset_*() iteration.
1677 *
1678 * Before migration is committed, the target migration tasks are on
1679 * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
1680 * the csets on ->dst_csets. ->csets point to either ->src_csets
1681 * or ->dst_csets depending on whether migration is committed.
1682 *
1683 * ->cur_csets and ->cur_task point to the current task position
1684 * during iteration.
1685 */
1686 struct list_head *csets;
1687 struct css_set *cur_cset;
1688 struct task_struct *cur_task;
1850}; 1689};
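The iteration scheme described in the struct comment above is what controller callbacks rely on. A hedged sketch of a controller's ->attach() walking the taskset with the iterators defined just below; the pr_info() stands in for real per-task work and is not part of this patch:

	#include <linux/cgroup.h>
	#include <linux/printk.h>
	#include <linux/sched.h>

	static void example_attach(struct cgroup_subsys_state *css,
				   struct cgroup_taskset *tset)
	{
		struct task_struct *task;

		for (task = cgroup_taskset_first(tset); task;
		     task = cgroup_taskset_next(tset))
			pr_info("attaching %s (pid %d)\n",
				task->comm, task_pid_nr(task));
	}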
1851 1690
1852/** 1691/**
@@ -1857,15 +1696,11 @@ struct cgroup_taskset {
1857 */ 1696 */
1858struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) 1697struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1859{ 1698{
1860 if (tset->tc_array) { 1699 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
1861 tset->idx = 0; 1700 tset->cur_task = NULL;
1862 return cgroup_taskset_next(tset); 1701
1863 } else { 1702 return cgroup_taskset_next(tset);
1864 tset->cur_cgrp = tset->single.cgrp;
1865 return tset->single.task;
1866 }
1867} 1703}
1868EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1869 1704
1870/** 1705/**
1871 * cgroup_taskset_next - iterate to the next task in taskset 1706 * cgroup_taskset_next - iterate to the next task in taskset
@@ -1876,48 +1711,36 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1876 */ 1711 */
1877struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) 1712struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1878{ 1713{
1879 struct task_and_cgroup *tc; 1714 struct css_set *cset = tset->cur_cset;
1715 struct task_struct *task = tset->cur_task;
1880 1716
1881 if (!tset->tc_array || tset->idx >= tset->tc_array_len) 1717 while (&cset->mg_node != tset->csets) {
1882 return NULL; 1718 if (!task)
1719 task = list_first_entry(&cset->mg_tasks,
1720 struct task_struct, cg_list);
1721 else
1722 task = list_next_entry(task, cg_list);
1883 1723
1884 tc = flex_array_get(tset->tc_array, tset->idx++); 1724 if (&task->cg_list != &cset->mg_tasks) {
1885 tset->cur_cgrp = tc->cgrp; 1725 tset->cur_cset = cset;
1886 return tc->task; 1726 tset->cur_task = task;
1887} 1727 return task;
1888EXPORT_SYMBOL_GPL(cgroup_taskset_next); 1728 }
1889 1729
1890/** 1730 cset = list_next_entry(cset, mg_node);
1891 * cgroup_taskset_cur_css - return the matching css for the current task 1731 task = NULL;
1892 * @tset: taskset of interest 1732 }
1893 * @subsys_id: the ID of the target subsystem
1894 *
1895 * Return the css for the current (last returned) task of @tset for
1896 * subsystem specified by @subsys_id. This function must be preceded by
1897 * either cgroup_taskset_first() or cgroup_taskset_next().
1898 */
1899struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
1900 int subsys_id)
1901{
1902 return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
1903}
1904EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
1905 1733
1906/** 1734 return NULL;
1907 * cgroup_taskset_size - return the number of tasks in taskset
1908 * @tset: taskset of interest
1909 */
1910int cgroup_taskset_size(struct cgroup_taskset *tset)
1911{
1912 return tset->tc_array ? tset->tc_array_len : 1;
1913} 1735}
1914EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1915 1736
1916 1737/**
1917/*
1918 * cgroup_task_migrate - move a task from one cgroup to another. 1738 * cgroup_task_migrate - move a task from one cgroup to another.
 1739 * @old_cgrp: the cgroup @tsk is being migrated from
1740 * @tsk: the task being migrated
1741 * @new_cset: the new css_set @tsk is being attached to
1919 * 1742 *
1920 * Must be called with cgroup_mutex and threadgroup locked. 1743 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
1921 */ 1744 */
1922static void cgroup_task_migrate(struct cgroup *old_cgrp, 1745static void cgroup_task_migrate(struct cgroup *old_cgrp,
1923 struct task_struct *tsk, 1746 struct task_struct *tsk,
@@ -1925,6 +1748,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1925{ 1748{
1926 struct css_set *old_cset; 1749 struct css_set *old_cset;
1927 1750
1751 lockdep_assert_held(&cgroup_mutex);
1752 lockdep_assert_held(&css_set_rwsem);
1753
1928 /* 1754 /*
1929 * We are synchronized through threadgroup_lock() against PF_EXITING 1755 * We are synchronized through threadgroup_lock() against PF_EXITING
1930 * setting such that we can't race against cgroup_exit() changing the 1756 * setting such that we can't race against cgroup_exit() changing the
@@ -1933,15 +1759,16 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1933 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1759 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1934 old_cset = task_css_set(tsk); 1760 old_cset = task_css_set(tsk);
1935 1761
1936 task_lock(tsk); 1762 get_css_set(new_cset);
1937 rcu_assign_pointer(tsk->cgroups, new_cset); 1763 rcu_assign_pointer(tsk->cgroups, new_cset);
1938 task_unlock(tsk);
1939 1764
1940 /* Update the css_set linked lists if we're using them */ 1765 /*
1941 write_lock(&css_set_lock); 1766 * Use move_tail so that cgroup_taskset_first() still returns the
1942 if (!list_empty(&tsk->cg_list)) 1767 * leader after migration. This works because cgroup_migrate()
1943 list_move(&tsk->cg_list, &new_cset->tasks); 1768 * ensures that the dst_cset of the leader is the first on the
1944 write_unlock(&css_set_lock); 1769 * tset's dst_csets list.
1770 */
1771 list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
1945 1772
1946 /* 1773 /*
1947 * We just gained a reference on old_cset by taking it from the 1774 * We just gained a reference on old_cset by taking it from the
@@ -1949,100 +1776,199 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
1949 * we're safe to drop it here; it will be freed under RCU. 1776 * we're safe to drop it here; it will be freed under RCU.
1950 */ 1777 */
1951 set_bit(CGRP_RELEASABLE, &old_cgrp->flags); 1778 set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1952 put_css_set(old_cset); 1779 put_css_set_locked(old_cset, false);
1953} 1780}
1954 1781
1955/** 1782/**
1956 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup 1783 * cgroup_migrate_finish - cleanup after attach
1957 * @cgrp: the cgroup to attach to 1784 * @preloaded_csets: list of preloaded css_sets
1958 * @tsk: the task or the leader of the threadgroup to be attached
1959 * @threadgroup: attach the whole threadgroup?
1960 * 1785 *
1961 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take 1786 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
1962 * task_lock of @tsk or each thread in the threadgroup individually in turn. 1787 * those functions for details.
1963 */ 1788 */
1964static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, 1789static void cgroup_migrate_finish(struct list_head *preloaded_csets)
1965 bool threadgroup)
1966{ 1790{
1967 int retval, i, group_size; 1791 struct css_set *cset, *tmp_cset;
1968 struct cgroupfs_root *root = cgrp->root;
1969 struct cgroup_subsys_state *css, *failed_css = NULL;
1970 /* threadgroup list cursor and array */
1971 struct task_struct *leader = tsk;
1972 struct task_and_cgroup *tc;
1973 struct flex_array *group;
1974 struct cgroup_taskset tset = { };
1975 1792
1976 /* 1793 lockdep_assert_held(&cgroup_mutex);
1977 * step 0: in order to do expensive, possibly blocking operations for 1794
1978 * every thread, we cannot iterate the thread group list, since it needs 1795 down_write(&css_set_rwsem);
1979 * rcu or tasklist locked. instead, build an array of all threads in the 1796 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
1980 * group - group_rwsem prevents new threads from appearing, and if 1797 cset->mg_src_cgrp = NULL;
1981 * threads exit, this will just be an over-estimate. 1798 cset->mg_dst_cset = NULL;
1982 */ 1799 list_del_init(&cset->mg_preload_node);
1983 if (threadgroup) 1800 put_css_set_locked(cset, false);
1984 group_size = get_nr_threads(tsk); 1801 }
1985 else 1802 up_write(&css_set_rwsem);
1986 group_size = 1; 1803}
1987 /* flex_array supports very large thread-groups better than kmalloc. */ 1804
1988 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); 1805/**
1989 if (!group) 1806 * cgroup_migrate_add_src - add a migration source css_set
1990 return -ENOMEM; 1807 * @src_cset: the source css_set to add
1991 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 1808 * @dst_cgrp: the destination cgroup
1992 retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); 1809 * @preloaded_csets: list of preloaded css_sets
1993 if (retval) 1810 *
1994 goto out_free_group_list; 1811 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
1812 * @src_cset and add it to @preloaded_csets, which should later be cleaned
1813 * up by cgroup_migrate_finish().
1814 *
1815 * This function may be called without holding threadgroup_lock even if the
1816 * target is a process. Threads may be created and destroyed but as long
1817 * as cgroup_mutex is not dropped, no new css_set can be put into play and
1818 * the preloaded css_sets are guaranteed to cover all migrations.
1819 */
1820static void cgroup_migrate_add_src(struct css_set *src_cset,
1821 struct cgroup *dst_cgrp,
1822 struct list_head *preloaded_csets)
1823{
1824 struct cgroup *src_cgrp;
1825
1826 lockdep_assert_held(&cgroup_mutex);
1827 lockdep_assert_held(&css_set_rwsem);
1828
1829 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1830
1831 /* nothing to do if this cset already belongs to the cgroup */
1832 if (src_cgrp == dst_cgrp)
1833 return;
1834
1835 if (!list_empty(&src_cset->mg_preload_node))
1836 return;
1837
1838 WARN_ON(src_cset->mg_src_cgrp);
1839 WARN_ON(!list_empty(&src_cset->mg_tasks));
1840 WARN_ON(!list_empty(&src_cset->mg_node));
1841
1842 src_cset->mg_src_cgrp = src_cgrp;
1843 get_css_set(src_cset);
1844 list_add(&src_cset->mg_preload_node, preloaded_csets);
1845}
1846
1847/**
1848 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1849 * @dst_cgrp: the destination cgroup
1850 * @preloaded_csets: list of preloaded source css_sets
1851 *
1852 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
1853 * have been preloaded to @preloaded_csets. This function looks up and
1854 * pins all destination css_sets, links each to its source, and put them on
1855 * @preloaded_csets.
1856 *
1857 * This function must be called after cgroup_migrate_add_src() has been
1858 * called on each migration source css_set. After migration is performed
1859 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
1860 * @preloaded_csets.
1861 */
1862static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1863 struct list_head *preloaded_csets)
1864{
1865 LIST_HEAD(csets);
1866 struct css_set *src_cset;
1867
1868 lockdep_assert_held(&cgroup_mutex);
1869
1870 /* look up the dst cset for each src cset and link it to src */
1871 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {
1872 struct css_set *dst_cset;
1873
1874 dst_cset = find_css_set(src_cset, dst_cgrp);
1875 if (!dst_cset)
1876 goto err;
1877
1878 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
1879 src_cset->mg_dst_cset = dst_cset;
1880
1881 if (list_empty(&dst_cset->mg_preload_node))
1882 list_add(&dst_cset->mg_preload_node, &csets);
1883 else
1884 put_css_set(dst_cset, false);
1885 }
1886
1887 list_splice(&csets, preloaded_csets);
1888 return 0;
1889err:
1890 cgroup_migrate_finish(&csets);
1891 return -ENOMEM;
1892}
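Taken together, cgroup_migrate_add_src(), cgroup_migrate_prepare_dst(), cgroup_migrate() (below) and cgroup_migrate_finish() form a three-phase protocol. A hedged restatement, written as if it sat in this file after all of those definitions, with error handling trimmed; cgroup_attach_task() further down is the in-tree user and additionally handles whole threadgroups. The caller is assumed to hold cgroup_mutex (and threadgroup_lock for processes).

	static int example_migrate_one(struct cgroup *dst_cgrp,
				       struct task_struct *task)
	{
		LIST_HEAD(preloaded_csets);
		int ret;

		/* phase 1: pin and record the source css_sets */
		down_read(&css_set_rwsem);
		cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
				       &preloaded_csets);
		up_read(&css_set_rwsem);

		/* phase 2: look up/allocate all destination css_sets up front */
		ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);

		/* phase 3: commit, then drop the preloaded references */
		if (!ret)
			ret = cgroup_migrate(dst_cgrp, task, false);
		cgroup_migrate_finish(&preloaded_csets);
		return ret;
	}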
1893
1894/**
1895 * cgroup_migrate - migrate a process or task to a cgroup
1896 * @cgrp: the destination cgroup
1897 * @leader: the leader of the process or the task to migrate
1898 * @threadgroup: whether @leader points to the whole process or a single task
1899 *
1900 * Migrate a process or task denoted by @leader to @cgrp. If migrating a
1901 * process, the caller must be holding threadgroup_lock of @leader. The
1902 * caller is also responsible for invoking cgroup_migrate_add_src() and
1903 * cgroup_migrate_prepare_dst() on the targets before invoking this
1904 * function and following up with cgroup_migrate_finish().
1905 *
1906 * As long as a controller's ->can_attach() doesn't fail, this function is
1907 * guaranteed to succeed. This means that, excluding ->can_attach()
1908 * failure, when migrating multiple targets, the success or failure can be
 1909 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
 1910 * actually starting the migration.
1911 */
1912static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1913 bool threadgroup)
1914{
1915 struct cgroup_taskset tset = {
1916 .src_csets = LIST_HEAD_INIT(tset.src_csets),
1917 .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
1918 .csets = &tset.src_csets,
1919 };
1920 struct cgroup_subsys_state *css, *failed_css = NULL;
1921 struct css_set *cset, *tmp_cset;
1922 struct task_struct *task, *tmp_task;
1923 int i, ret;
1995 1924
1996 i = 0;
1997 /* 1925 /*
1998 * Prevent freeing of tasks while we take a snapshot. Tasks that are 1926 * Prevent freeing of tasks while we take a snapshot. Tasks that are
1999 * already PF_EXITING could be freed from underneath us unless we 1927 * already PF_EXITING could be freed from underneath us unless we
2000 * take an rcu_read_lock. 1928 * take an rcu_read_lock.
2001 */ 1929 */
1930 down_write(&css_set_rwsem);
2002 rcu_read_lock(); 1931 rcu_read_lock();
1932 task = leader;
2003 do { 1933 do {
2004 struct task_and_cgroup ent; 1934 /* @task either already exited or can't exit until the end */
1935 if (task->flags & PF_EXITING)
1936 goto next;
2005 1937
2006 /* @tsk either already exited or can't exit until the end */ 1938 /* leave @task alone if post_fork() hasn't linked it yet */
2007 if (tsk->flags & PF_EXITING) 1939 if (list_empty(&task->cg_list))
2008 goto next; 1940 goto next;
2009 1941
2010 /* as per above, nr_threads may decrease, but not increase. */ 1942 cset = task_css_set(task);
2011 BUG_ON(i >= group_size); 1943 if (!cset->mg_src_cgrp)
2012 ent.task = tsk;
2013 ent.cgrp = task_cgroup_from_root(tsk, root);
2014 /* nothing to do if this task is already in the cgroup */
2015 if (ent.cgrp == cgrp)
2016 goto next; 1944 goto next;
1945
2017 /* 1946 /*
2018 * saying GFP_ATOMIC has no effect here because we did prealloc 1947 * cgroup_taskset_first() must always return the leader.
2019 * earlier, but it's good form to communicate our expectations. 1948 * Take care to avoid disturbing the ordering.
2020 */ 1949 */
2021 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 1950 list_move_tail(&task->cg_list, &cset->mg_tasks);
2022 BUG_ON(retval != 0); 1951 if (list_empty(&cset->mg_node))
2023 i++; 1952 list_add_tail(&cset->mg_node, &tset.src_csets);
1953 if (list_empty(&cset->mg_dst_cset->mg_node))
1954 list_move_tail(&cset->mg_dst_cset->mg_node,
1955 &tset.dst_csets);
2024 next: 1956 next:
2025 if (!threadgroup) 1957 if (!threadgroup)
2026 break; 1958 break;
2027 } while_each_thread(leader, tsk); 1959 } while_each_thread(leader, task);
2028 rcu_read_unlock(); 1960 rcu_read_unlock();
2029 /* remember the number of threads in the array for later. */ 1961 up_write(&css_set_rwsem);
2030 group_size = i;
2031 tset.tc_array = group;
2032 tset.tc_array_len = group_size;
2033 1962
2034 /* methods shouldn't be called if no task is actually migrating */ 1963 /* methods shouldn't be called if no task is actually migrating */
2035 retval = 0; 1964 if (list_empty(&tset.src_csets))
2036 if (!group_size) 1965 return 0;
2037 goto out_free_group_list;
2038 1966
2039 /* 1967 /* check that we can legitimately attach to the cgroup */
2040 * step 1: check that we can legitimately attach to the cgroup.
2041 */
2042 for_each_css(css, i, cgrp) { 1968 for_each_css(css, i, cgrp) {
2043 if (css->ss->can_attach) { 1969 if (css->ss->can_attach) {
2044 retval = css->ss->can_attach(css, &tset); 1970 ret = css->ss->can_attach(css, &tset);
2045 if (retval) { 1971 if (ret) {
2046 failed_css = css; 1972 failed_css = css;
2047 goto out_cancel_attach; 1973 goto out_cancel_attach;
2048 } 1974 }
@@ -2050,70 +1976,91 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2050 } 1976 }
2051 1977
2052 /* 1978 /*
2053 * step 2: make sure css_sets exist for all threads to be migrated. 1979 * Now that we're guaranteed success, proceed to move all tasks to
2054 * we use find_css_set, which allocates a new one if necessary. 1980 * the new cgroup. There are no failure cases after here, so this
1981 * is the commit point.
2055 */ 1982 */
2056 for (i = 0; i < group_size; i++) { 1983 down_write(&css_set_rwsem);
2057 struct css_set *old_cset; 1984 list_for_each_entry(cset, &tset.src_csets, mg_node) {
2058 1985 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
2059 tc = flex_array_get(group, i); 1986 cgroup_task_migrate(cset->mg_src_cgrp, task,
2060 old_cset = task_css_set(tc->task); 1987 cset->mg_dst_cset);
2061 tc->cset = find_css_set(old_cset, cgrp);
2062 if (!tc->cset) {
2063 retval = -ENOMEM;
2064 goto out_put_css_set_refs;
2065 }
2066 } 1988 }
1989 up_write(&css_set_rwsem);
2067 1990
2068 /* 1991 /*
2069 * step 3: now that we're guaranteed success wrt the css_sets, 1992 * Migration is committed, all target tasks are now on dst_csets.
2070 * proceed to move all tasks to the new cgroup. There are no 1993 * Nothing is sensitive to fork() after this point. Notify
2071 * failure cases after here, so this is the commit point. 1994 * controllers that migration is complete.
2072 */ 1995 */
2073 for (i = 0; i < group_size; i++) { 1996 tset.csets = &tset.dst_csets;
2074 tc = flex_array_get(group, i);
2075 cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
2076 }
2077 /* nothing is sensitive to fork() after this point. */
2078 1997
2079 /*
2080 * step 4: do subsystem attach callbacks.
2081 */
2082 for_each_css(css, i, cgrp) 1998 for_each_css(css, i, cgrp)
2083 if (css->ss->attach) 1999 if (css->ss->attach)
2084 css->ss->attach(css, &tset); 2000 css->ss->attach(css, &tset);
2085 2001
2086 /* 2002 ret = 0;
2087 * step 5: success! and cleanup 2003 goto out_release_tset;
2088 */ 2004
2089 retval = 0;
2090out_put_css_set_refs:
2091 if (retval) {
2092 for (i = 0; i < group_size; i++) {
2093 tc = flex_array_get(group, i);
2094 if (!tc->cset)
2095 break;
2096 put_css_set(tc->cset);
2097 }
2098 }
2099out_cancel_attach: 2005out_cancel_attach:
2100 if (retval) { 2006 for_each_css(css, i, cgrp) {
2101 for_each_css(css, i, cgrp) { 2007 if (css == failed_css)
2102 if (css == failed_css) 2008 break;
2103 break; 2009 if (css->ss->cancel_attach)
2104 if (css->ss->cancel_attach) 2010 css->ss->cancel_attach(css, &tset);
2105 css->ss->cancel_attach(css, &tset);
2106 }
2107 } 2011 }
2108out_free_group_list: 2012out_release_tset:
2109 flex_array_free(group); 2013 down_write(&css_set_rwsem);
2110 return retval; 2014 list_splice_init(&tset.dst_csets, &tset.src_csets);
2015 list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2016 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2017 list_del_init(&cset->mg_node);
2018 }
2019 up_write(&css_set_rwsem);
2020 return ret;
2021}
2022
2023/**
2024 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2025 * @dst_cgrp: the cgroup to attach to
2026 * @leader: the task or the leader of the threadgroup to be attached
2027 * @threadgroup: attach the whole threadgroup?
2028 *
2029 * Call holding cgroup_mutex and threadgroup_lock of @leader.
2030 */
2031static int cgroup_attach_task(struct cgroup *dst_cgrp,
2032 struct task_struct *leader, bool threadgroup)
2033{
2034 LIST_HEAD(preloaded_csets);
2035 struct task_struct *task;
2036 int ret;
2037
2038 /* look up all src csets */
2039 down_read(&css_set_rwsem);
2040 rcu_read_lock();
2041 task = leader;
2042 do {
2043 cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2044 &preloaded_csets);
2045 if (!threadgroup)
2046 break;
2047 } while_each_thread(leader, task);
2048 rcu_read_unlock();
2049 up_read(&css_set_rwsem);
2050
2051 /* prepare dst csets and commit */
2052 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2053 if (!ret)
2054 ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
2055
2056 cgroup_migrate_finish(&preloaded_csets);
2057 return ret;
2111} 2058}
2112 2059
2113/* 2060/*
2114 * Find the task_struct of the task to attach by vpid and pass it along to the 2061 * Find the task_struct of the task to attach by vpid and pass it along to the
2115 * function to attach either it or all tasks in its threadgroup. Will lock 2062 * function to attach either it or all tasks in its threadgroup. Will lock
2116 * cgroup_mutex and threadgroup; may take task_lock of task. 2063 * cgroup_mutex and threadgroup.
2117 */ 2064 */
2118static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2065static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2119{ 2066{
@@ -2198,12 +2145,19 @@ out_unlock_cgroup:
2198 */ 2145 */
2199int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) 2146int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2200{ 2147{
2201 struct cgroupfs_root *root; 2148 struct cgroup_root *root;
2202 int retval = 0; 2149 int retval = 0;
2203 2150
2204 mutex_lock(&cgroup_mutex); 2151 mutex_lock(&cgroup_mutex);
2205 for_each_active_root(root) { 2152 for_each_root(root) {
2206 struct cgroup *from_cgrp = task_cgroup_from_root(from, root); 2153 struct cgroup *from_cgrp;
2154
2155 if (root == &cgrp_dfl_root)
2156 continue;
2157
2158 down_read(&css_set_rwsem);
2159 from_cgrp = task_cgroup_from_root(from, root);
2160 up_read(&css_set_rwsem);
2207 2161
2208 retval = cgroup_attach_task(from_cgrp, tsk, false); 2162 retval = cgroup_attach_task(from_cgrp, tsk, false);
2209 if (retval) 2163 if (retval)
@@ -2228,16 +2182,17 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css,
2228} 2182}
2229 2183
2230static int cgroup_release_agent_write(struct cgroup_subsys_state *css, 2184static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2231 struct cftype *cft, const char *buffer) 2185 struct cftype *cft, char *buffer)
2232{ 2186{
2233 BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); 2187 struct cgroup_root *root = css->cgroup->root;
2234 if (strlen(buffer) >= PATH_MAX) 2188
2235 return -EINVAL; 2189 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
2236 if (!cgroup_lock_live_group(css->cgroup)) 2190 if (!cgroup_lock_live_group(css->cgroup))
2237 return -ENODEV; 2191 return -ENODEV;
2238 mutex_lock(&cgroup_root_mutex); 2192 spin_lock(&release_agent_path_lock);
2239 strcpy(css->cgroup->root->release_agent_path, buffer); 2193 strlcpy(root->release_agent_path, buffer,
2240 mutex_unlock(&cgroup_root_mutex); 2194 sizeof(root->release_agent_path));
2195 spin_unlock(&release_agent_path_lock);
2241 mutex_unlock(&cgroup_mutex); 2196 mutex_unlock(&cgroup_mutex);
2242 return 0; 2197 return 0;
2243} 2198}
@@ -2262,32 +2217,23 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2262 return 0; 2217 return 0;
2263} 2218}
2264 2219
2265/* A buffer size big enough for numbers or short strings */ 2220static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2266#define CGROUP_LOCAL_BUFFER_SIZE 64 2221 size_t nbytes, loff_t off)
2267
2268static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2269 size_t nbytes, loff_t *ppos)
2270{ 2222{
2271 struct cfent *cfe = __d_cfe(file->f_dentry); 2223 struct cgroup *cgrp = of->kn->parent->priv;
2272 struct cftype *cft = __d_cft(file->f_dentry); 2224 struct cftype *cft = of->kn->priv;
2273 struct cgroup_subsys_state *css = cfe->css; 2225 struct cgroup_subsys_state *css;
2274 size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
2275 char *buf;
2276 int ret; 2226 int ret;
2277 2227
2278 if (nbytes >= max_bytes) 2228 /*
2279 return -E2BIG; 2229 * kernfs guarantees that a file isn't deleted with operations in
2280 2230 * flight, which means that the matching css is and stays alive and
2281 buf = kmalloc(nbytes + 1, GFP_KERNEL); 2231 * doesn't need to be pinned. The RCU locking is not necessary
2282 if (!buf) 2232 * either. It's just for the convenience of using cgroup_css().
2283 return -ENOMEM; 2233 */
2284 2234 rcu_read_lock();
2285 if (copy_from_user(buf, userbuf, nbytes)) { 2235 css = cgroup_css(cgrp, cft->ss);
2286 ret = -EFAULT; 2236 rcu_read_unlock();
2287 goto out_free;
2288 }
2289
2290 buf[nbytes] = '\0';
2291 2237
2292 if (cft->write_string) { 2238 if (cft->write_string) {
2293 ret = cft->write_string(css, cft, strstrip(buf)); 2239 ret = cft->write_string(css, cft, strstrip(buf));
@@ -2306,53 +2252,23 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
2306 } else { 2252 } else {
2307 ret = -EINVAL; 2253 ret = -EINVAL;
2308 } 2254 }
2309out_free: 2255
2310 kfree(buf);
2311 return ret ?: nbytes; 2256 return ret ?: nbytes;
2312} 2257}
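To make the dispatch above concrete, here is a hedged example of a control file whose writes land in the write_string branch and whose reads go through cgroup_seqfile_show(); every name and the static buffer are invented for illustration.

	#include <linux/cgroup.h>
	#include <linux/limits.h>
	#include <linux/seq_file.h>
	#include <linux/string.h>

	static char example_agent[PATH_MAX];

	static int example_show(struct seq_file *sf, void *v)
	{
		seq_printf(sf, "%s\n", example_agent);
		return 0;
	}

	static int example_write(struct cgroup_subsys_state *css,
				 struct cftype *cft, char *buf)
	{
		strlcpy(example_agent, buf, sizeof(example_agent));
		return 0;
	}

	static struct cftype example_files[] = {
		{
			.name = "example.agent",
			.seq_show = example_show,
			.write_string = example_write,
			.max_write_len = PATH_MAX,
		},
		{ }	/* terminator */
	};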
2313 2258
2314/*
2315 * seqfile ops/methods for returning structured data. Currently just
2316 * supports string->u64 maps, but can be extended in future.
2317 */
2318
2319static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) 2259static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2320{ 2260{
2321 struct cftype *cft = seq_cft(seq); 2261 return seq_cft(seq)->seq_start(seq, ppos);
2322
2323 if (cft->seq_start) {
2324 return cft->seq_start(seq, ppos);
2325 } else {
2326 /*
2327 * The same behavior and code as single_open(). Returns
2328 * !NULL if pos is at the beginning; otherwise, NULL.
2329 */
2330 return NULL + !*ppos;
2331 }
2332} 2262}
2333 2263
2334static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) 2264static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2335{ 2265{
2336 struct cftype *cft = seq_cft(seq); 2266 return seq_cft(seq)->seq_next(seq, v, ppos);
2337
2338 if (cft->seq_next) {
2339 return cft->seq_next(seq, v, ppos);
2340 } else {
2341 /*
2342 * The same behavior and code as single_open(), always
2343 * terminate after the initial read.
2344 */
2345 ++*ppos;
2346 return NULL;
2347 }
2348} 2267}
2349 2268
2350static void cgroup_seqfile_stop(struct seq_file *seq, void *v) 2269static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2351{ 2270{
2352 struct cftype *cft = seq_cft(seq); 2271 seq_cft(seq)->seq_stop(seq, v);
2353
2354 if (cft->seq_stop)
2355 cft->seq_stop(seq, v);
2356} 2272}
2357 2273
2358static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2274static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -2372,96 +2288,35 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2372 return 0; 2288 return 0;
2373} 2289}
2374 2290
2375static struct seq_operations cgroup_seq_operations = { 2291static struct kernfs_ops cgroup_kf_single_ops = {
2376 .start = cgroup_seqfile_start, 2292 .atomic_write_len = PAGE_SIZE,
2377 .next = cgroup_seqfile_next, 2293 .write = cgroup_file_write,
2378 .stop = cgroup_seqfile_stop, 2294 .seq_show = cgroup_seqfile_show,
2379 .show = cgroup_seqfile_show,
2380}; 2295};
2381 2296
2382static int cgroup_file_open(struct inode *inode, struct file *file) 2297static struct kernfs_ops cgroup_kf_ops = {
2383{ 2298 .atomic_write_len = PAGE_SIZE,
2384 struct cfent *cfe = __d_cfe(file->f_dentry); 2299 .write = cgroup_file_write,
2385 struct cftype *cft = __d_cft(file->f_dentry); 2300 .seq_start = cgroup_seqfile_start,
2386 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); 2301 .seq_next = cgroup_seqfile_next,
2387 struct cgroup_subsys_state *css; 2302 .seq_stop = cgroup_seqfile_stop,
2388 struct cgroup_open_file *of; 2303 .seq_show = cgroup_seqfile_show,
2389 int err; 2304};
2390
2391 err = generic_file_open(inode, file);
2392 if (err)
2393 return err;
2394
2395 /*
2396 * If the file belongs to a subsystem, pin the css. Will be
2397 * unpinned either on open failure or release. This ensures that
2398 * @css stays alive for all file operations.
2399 */
2400 rcu_read_lock();
2401 css = cgroup_css(cgrp, cft->ss);
2402 if (cft->ss && !css_tryget(css))
2403 css = NULL;
2404 rcu_read_unlock();
2405
2406 if (!css)
2407 return -ENODEV;
2408
2409 /*
2410 * @cfe->css is used by read/write/close to determine the
2411 * associated css. @file->private_data would be a better place but
2412 * that's already used by seqfile. Multiple accessors may use it
2413 * simultaneously which is okay as the association never changes.
2414 */
2415 WARN_ON_ONCE(cfe->css && cfe->css != css);
2416 cfe->css = css;
2417
2418 of = __seq_open_private(file, &cgroup_seq_operations,
2419 sizeof(struct cgroup_open_file));
2420 if (of) {
2421 of->cfe = cfe;
2422 return 0;
2423 }
2424
2425 if (css->ss)
2426 css_put(css);
2427 return -ENOMEM;
2428}
2429
2430static int cgroup_file_release(struct inode *inode, struct file *file)
2431{
2432 struct cfent *cfe = __d_cfe(file->f_dentry);
2433 struct cgroup_subsys_state *css = cfe->css;
2434
2435 if (css->ss)
2436 css_put(css);
2437 return seq_release_private(inode, file);
2438}
2439 2305
2440/* 2306/*
2441 * cgroup_rename - Only allow simple rename of directories in place. 2307 * cgroup_rename - Only allow simple rename of directories in place.
2442 */ 2308 */
2443static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 2309static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2444 struct inode *new_dir, struct dentry *new_dentry) 2310 const char *new_name_str)
2445{ 2311{
2312 struct cgroup *cgrp = kn->priv;
2446 int ret; 2313 int ret;
2447 struct cgroup_name *name, *old_name;
2448 struct cgroup *cgrp;
2449
2450 /*
 2451 * It's convenient to use the parent dir's i_mutex to protect
2452 * cgrp->name.
2453 */
2454 lockdep_assert_held(&old_dir->i_mutex);
2455 2314
2456 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 2315 if (kernfs_type(kn) != KERNFS_DIR)
2457 return -ENOTDIR; 2316 return -ENOTDIR;
2458 if (new_dentry->d_inode) 2317 if (kn->parent != new_parent)
2459 return -EEXIST;
2460 if (old_dir != new_dir)
2461 return -EIO; 2318 return -EIO;
2462 2319
2463 cgrp = __d_cgrp(old_dentry);
2464
2465 /* 2320 /*
2466 * This isn't a proper migration and its usefulness is very 2321 * This isn't a proper migration and its usefulness is very
2467 * limited. Disallow if sane_behavior. 2322 * limited. Disallow if sane_behavior.
@@ -2469,218 +2324,40 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2469 if (cgroup_sane_behavior(cgrp)) 2324 if (cgroup_sane_behavior(cgrp))
2470 return -EPERM; 2325 return -EPERM;
2471 2326
2472 name = cgroup_alloc_name(new_dentry); 2327 /*
2473 if (!name) 2328 * We're gonna grab cgroup_tree_mutex which nests outside kernfs
2474 return -ENOMEM; 2329 * active_ref. kernfs_rename() doesn't require active_ref
2475 2330 * protection. Break them before grabbing cgroup_tree_mutex.
2476 ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2331 */
2477 if (ret) { 2332 kernfs_break_active_protection(new_parent);
2478 kfree(name); 2333 kernfs_break_active_protection(kn);
2479 return ret;
2480 }
2481
2482 old_name = rcu_dereference_protected(cgrp->name, true);
2483 rcu_assign_pointer(cgrp->name, name);
2484
2485 kfree_rcu(old_name, rcu_head);
2486 return 0;
2487}
2488
2489static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2490{
2491 if (S_ISDIR(dentry->d_inode->i_mode))
2492 return &__d_cgrp(dentry)->xattrs;
2493 else
2494 return &__d_cfe(dentry)->xattrs;
2495}
2496
2497static inline int xattr_enabled(struct dentry *dentry)
2498{
2499 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2500 return root->flags & CGRP_ROOT_XATTR;
2501}
2502
2503static bool is_valid_xattr(const char *name)
2504{
2505 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
2506 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
2507 return true;
2508 return false;
2509}
2510
2511static int cgroup_setxattr(struct dentry *dentry, const char *name,
2512 const void *val, size_t size, int flags)
2513{
2514 if (!xattr_enabled(dentry))
2515 return -EOPNOTSUPP;
2516 if (!is_valid_xattr(name))
2517 return -EINVAL;
2518 return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
2519}
2520
2521static int cgroup_removexattr(struct dentry *dentry, const char *name)
2522{
2523 if (!xattr_enabled(dentry))
2524 return -EOPNOTSUPP;
2525 if (!is_valid_xattr(name))
2526 return -EINVAL;
2527 return simple_xattr_remove(__d_xattrs(dentry), name);
2528}
2529
2530static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
2531 void *buf, size_t size)
2532{
2533 if (!xattr_enabled(dentry))
2534 return -EOPNOTSUPP;
2535 if (!is_valid_xattr(name))
2536 return -EINVAL;
2537 return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
2538}
2539
2540static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
2541{
2542 if (!xattr_enabled(dentry))
2543 return -EOPNOTSUPP;
2544 return simple_xattr_list(__d_xattrs(dentry), buf, size);
2545}
2546
2547static const struct file_operations cgroup_file_operations = {
2548 .read = seq_read,
2549 .write = cgroup_file_write,
2550 .llseek = generic_file_llseek,
2551 .open = cgroup_file_open,
2552 .release = cgroup_file_release,
2553};
2554
2555static const struct inode_operations cgroup_file_inode_operations = {
2556 .setxattr = cgroup_setxattr,
2557 .getxattr = cgroup_getxattr,
2558 .listxattr = cgroup_listxattr,
2559 .removexattr = cgroup_removexattr,
2560};
2561
2562static const struct inode_operations cgroup_dir_inode_operations = {
2563 .lookup = simple_lookup,
2564 .mkdir = cgroup_mkdir,
2565 .rmdir = cgroup_rmdir,
2566 .rename = cgroup_rename,
2567 .setxattr = cgroup_setxattr,
2568 .getxattr = cgroup_getxattr,
2569 .listxattr = cgroup_listxattr,
2570 .removexattr = cgroup_removexattr,
2571};
2572
2573static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2574 struct super_block *sb)
2575{
2576 struct inode *inode;
2577
2578 if (!dentry)
2579 return -ENOENT;
2580 if (dentry->d_inode)
2581 return -EEXIST;
2582
2583 inode = cgroup_new_inode(mode, sb);
2584 if (!inode)
2585 return -ENOMEM;
2586
2587 if (S_ISDIR(mode)) {
2588 inode->i_op = &cgroup_dir_inode_operations;
2589 inode->i_fop = &simple_dir_operations;
2590
2591 /* start off with i_nlink == 2 (for "." entry) */
2592 inc_nlink(inode);
2593 inc_nlink(dentry->d_parent->d_inode);
2594
2595 /*
2596 * Control reaches here with cgroup_mutex held.
2597 * @inode->i_mutex should nest outside cgroup_mutex but we
2598 * want to populate it immediately without releasing
2599 * cgroup_mutex. As @inode isn't visible to anyone else
2600 * yet, trylock will always succeed without affecting
2601 * lockdep checks.
2602 */
2603 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2604 } else if (S_ISREG(mode)) {
2605 inode->i_size = 0;
2606 inode->i_fop = &cgroup_file_operations;
2607 inode->i_op = &cgroup_file_inode_operations;
2608 }
2609 d_instantiate(dentry, inode);
2610 dget(dentry); /* Extra count - pin the dentry in core */
2611 return 0;
2612}
2613
2614/**
2615 * cgroup_file_mode - deduce file mode of a control file
2616 * @cft: the control file in question
2617 *
2618 * returns cft->mode if ->mode is not 0
2619 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
2620 * returns S_IRUGO if it has only a read handler
2621 * returns S_IWUSR if it has only a write handler
2622 */
2623static umode_t cgroup_file_mode(const struct cftype *cft)
2624{
2625 umode_t mode = 0;
2626 2334
2627 if (cft->mode) 2335 mutex_lock(&cgroup_tree_mutex);
2628 return cft->mode; 2336 mutex_lock(&cgroup_mutex);
2629 2337
2630 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 2338 ret = kernfs_rename(kn, new_parent, new_name_str);
2631 mode |= S_IRUGO;
2632 2339
2633 if (cft->write_u64 || cft->write_s64 || cft->write_string || 2340 mutex_unlock(&cgroup_mutex);
2634 cft->trigger) 2341 mutex_unlock(&cgroup_tree_mutex);
2635 mode |= S_IWUSR;
2636 2342
2637 return mode; 2343 kernfs_unbreak_active_protection(kn);
2344 kernfs_unbreak_active_protection(new_parent);
2345 return ret;
2638} 2346}
2639 2347
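The comment in the rewritten cgroup_rename() above describes a locking pattern that recurs in cgroup_mkdir() later in this diff: cgroup_tree_mutex nests outside the kernfs active reference, so the active reference has to be dropped around any section that takes the cgroup mutexes. A minimal sketch of the pattern, where kn stands for whichever kernfs_node is involved:

kernfs_break_active_protection(kn);

mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex);
/* ... work on the hierarchy that needs the cgroup mutexes ... */
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);

kernfs_unbreak_active_protection(kn);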
2640static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) 2348static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2641{ 2349{
2642 struct dentry *dir = cgrp->dentry; 2350 char name[CGROUP_FILE_NAME_MAX];
2643 struct cgroup *parent = __d_cgrp(dir); 2351 struct kernfs_node *kn;
2644 struct dentry *dentry; 2352 struct lock_class_key *key = NULL;
2645 struct cfent *cfe;
2646 int error;
2647 umode_t mode;
2648 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2649
2650 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
2651 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2652 strcpy(name, cft->ss->name);
2653 strcat(name, ".");
2654 }
2655 strcat(name, cft->name);
2656
2657 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
2658
2659 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
2660 if (!cfe)
2661 return -ENOMEM;
2662 2353
2663 dentry = lookup_one_len(name, dir, strlen(name)); 2354#ifdef CONFIG_DEBUG_LOCK_ALLOC
2664 if (IS_ERR(dentry)) { 2355 key = &cft->lockdep_key;
2665 error = PTR_ERR(dentry); 2356#endif
2666 goto out; 2357 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
2667 } 2358 cgroup_file_mode(cft), 0, cft->kf_ops, cft,
2668 2359 NULL, false, key);
2669 cfe->type = (void *)cft; 2360 return PTR_ERR_OR_ZERO(kn);
2670 cfe->dentry = dentry;
2671 dentry->d_fsdata = cfe;
2672 simple_xattrs_init(&cfe->xattrs);
2673
2674 mode = cgroup_file_mode(cft);
2675 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
2676 if (!error) {
2677 list_add_tail(&cfe->node, &parent->files);
2678 cfe = NULL;
2679 }
2680 dput(dentry);
2681out:
2682 kfree(cfe);
2683 return error;
2684} 2361}
2685 2362
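cgroup_add_file() now delegates name construction to cgroup_file_name(), whose body is outside this hunk. Judging from the removed code above it presumably keeps the same rule; the helper below is a reconstruction for illustration only, not the patch's actual implementation:

static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
			      char *buf)
{
	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
			 cft->ss->name, cft->name);
	else
		strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
	return buf;
}

The visible effect is unchanged: controller files keep their "<subsystem>.<name>" form (for example "memory.limit_in_bytes") while core files such as "tasks" stay unprefixed.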
2686/** 2363/**
@@ -2700,11 +2377,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2700 struct cftype *cft; 2377 struct cftype *cft;
2701 int ret; 2378 int ret;
2702 2379
2703 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 2380 lockdep_assert_held(&cgroup_tree_mutex);
2704 lockdep_assert_held(&cgroup_mutex);
2705 2381
2706 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2382 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2707 /* does cft->flags tell us to skip this file on @cgrp? */ 2383 /* does cft->flags tell us to skip this file on @cgrp? */
2384 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2385 continue;
2708 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2386 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2709 continue; 2387 continue;
2710 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2388 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
@@ -2726,44 +2404,19 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2726 return 0; 2404 return 0;
2727} 2405}
2728 2406
2729static void cgroup_cfts_prepare(void) 2407static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2730 __acquires(&cgroup_mutex)
2731{
2732 /*
2733 * Thanks to the entanglement with vfs inode locking, we can't walk
2734 * the existing cgroups under cgroup_mutex and create files.
2735 * Instead, we use css_for_each_descendant_pre() and drop RCU read
2736 * lock before calling cgroup_addrm_files().
2737 */
2738 mutex_lock(&cgroup_mutex);
2739}
2740
2741static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2742 __releases(&cgroup_mutex)
2743{ 2408{
2744 LIST_HEAD(pending); 2409 LIST_HEAD(pending);
2745 struct cgroup_subsys *ss = cfts[0].ss; 2410 struct cgroup_subsys *ss = cfts[0].ss;
2746 struct cgroup *root = &ss->root->top_cgroup; 2411 struct cgroup *root = &ss->root->cgrp;
2747 struct super_block *sb = ss->root->sb;
2748 struct dentry *prev = NULL;
2749 struct inode *inode;
2750 struct cgroup_subsys_state *css; 2412 struct cgroup_subsys_state *css;
2751 u64 update_before;
2752 int ret = 0; 2413 int ret = 0;
2753 2414
2754 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2415 lockdep_assert_held(&cgroup_tree_mutex);
2755 if (!cfts || ss->root == &cgroup_dummy_root ||
2756 !atomic_inc_not_zero(&sb->s_active)) {
2757 mutex_unlock(&cgroup_mutex);
2758 return 0;
2759 }
2760 2416
2761 /* 2417 /* don't bother if @ss isn't attached */
2762 * All cgroups which are created after we drop cgroup_mutex will 2418 if (ss->root == &cgrp_dfl_root)
2763 * have the updated set of files, so we only need to update the 2419 return 0;
2764 * cgroups created before the current @cgroup_serial_nr_next.
2765 */
2766 update_before = cgroup_serial_nr_next;
2767 2420
2768 /* add/rm files for all cgroups created before */ 2421 /* add/rm files for all cgroups created before */
2769 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 2422 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
@@ -2772,62 +2425,75 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2772 if (cgroup_is_dead(cgrp)) 2425 if (cgroup_is_dead(cgrp))
2773 continue; 2426 continue;
2774 2427
2775 inode = cgrp->dentry->d_inode; 2428 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2776 dget(cgrp->dentry);
2777 dput(prev);
2778 prev = cgrp->dentry;
2779
2780 mutex_unlock(&cgroup_mutex);
2781 mutex_lock(&inode->i_mutex);
2782 mutex_lock(&cgroup_mutex);
2783 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2784 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2785 mutex_unlock(&inode->i_mutex);
2786 if (ret) 2429 if (ret)
2787 break; 2430 break;
2788 } 2431 }
2789 mutex_unlock(&cgroup_mutex); 2432
2790 dput(prev); 2433 if (is_add && !ret)
2791 deactivate_super(sb); 2434 kernfs_activate(root->kn);
2792 return ret; 2435 return ret;
2793} 2436}
2794 2437
2795/** 2438static void cgroup_exit_cftypes(struct cftype *cfts)
2796 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2797 * @ss: target cgroup subsystem
2798 * @cfts: zero-length name terminated array of cftypes
2799 *
2800 * Register @cfts to @ss. Files described by @cfts are created for all
2801 * existing cgroups to which @ss is attached and all future cgroups will
2802 * have them too. This function can be called anytime whether @ss is
2803 * attached or not.
2804 *
2805 * Returns 0 on successful registration, -errno on failure. Note that this
2806 * function currently returns 0 as long as @cfts registration is successful
2807 * even if some file creation attempts on existing cgroups fail.
2808 */
2809int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2810{ 2439{
2811 struct cftype_set *set;
2812 struct cftype *cft; 2440 struct cftype *cft;
2813 int ret;
2814 2441
2815 set = kzalloc(sizeof(*set), GFP_KERNEL); 2442 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2816 if (!set) 2443 /* free copy for custom atomic_write_len, see init_cftypes() */
2817 return -ENOMEM; 2444 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
2445 kfree(cft->kf_ops);
2446 cft->kf_ops = NULL;
2447 cft->ss = NULL;
2448 }
2449}
2818 2450
2819 for (cft = cfts; cft->name[0] != '\0'; cft++) 2451static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2452{
2453 struct cftype *cft;
2454
2455 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2456 struct kernfs_ops *kf_ops;
2457
2458 WARN_ON(cft->ss || cft->kf_ops);
2459
2460 if (cft->seq_start)
2461 kf_ops = &cgroup_kf_ops;
2462 else
2463 kf_ops = &cgroup_kf_single_ops;
2464
2465 /*
2466 * Ugh... if @cft wants a custom max_write_len, we need to
2467 * make a copy of kf_ops to set its atomic_write_len.
2468 */
2469 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
2470 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
2471 if (!kf_ops) {
2472 cgroup_exit_cftypes(cfts);
2473 return -ENOMEM;
2474 }
2475 kf_ops->atomic_write_len = cft->max_write_len;
2476 }
2477
2478 cft->kf_ops = kf_ops;
2820 cft->ss = ss; 2479 cft->ss = ss;
2480 }
2821 2481
2822 cgroup_cfts_prepare(); 2482 return 0;
2823 set->cfts = cfts; 2483}
2824 list_add_tail(&set->node, &ss->cftsets); 2484
2825 ret = cgroup_cfts_commit(cfts, true); 2485static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2826 if (ret) 2486{
2827 cgroup_rm_cftypes(cfts); 2487 lockdep_assert_held(&cgroup_tree_mutex);
2828 return ret; 2488
2489 if (!cfts || !cfts[0].ss)
2490 return -ENOENT;
2491
2492 list_del(&cfts->node);
2493 cgroup_apply_cftypes(cfts, false);
2494 cgroup_exit_cftypes(cfts);
2495 return 0;
2829} 2496}
2830EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2831 2497
2832/** 2498/**
2833 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2499 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
@@ -2842,24 +2508,48 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2842 */ 2508 */
2843int cgroup_rm_cftypes(struct cftype *cfts) 2509int cgroup_rm_cftypes(struct cftype *cfts)
2844{ 2510{
2845 struct cftype_set *set; 2511 int ret;
2846 2512
2847 if (!cfts || !cfts[0].ss) 2513 mutex_lock(&cgroup_tree_mutex);
2848 return -ENOENT; 2514 ret = cgroup_rm_cftypes_locked(cfts);
2515 mutex_unlock(&cgroup_tree_mutex);
2516 return ret;
2517}
2849 2518
2850 cgroup_cfts_prepare(); 2519/**
2520 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2521 * @ss: target cgroup subsystem
2522 * @cfts: zero-length name terminated array of cftypes
2523 *
2524 * Register @cfts to @ss. Files described by @cfts are created for all
2525 * existing cgroups to which @ss is attached and all future cgroups will
2526 * have them too. This function can be called anytime whether @ss is
2527 * attached or not.
2528 *
2529 * Returns 0 on successful registration, -errno on failure. Note that this
2530 * function currently returns 0 as long as @cfts registration is successful
2531 * even if some file creation attempts on existing cgroups fail.
2532 */
2533int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2534{
2535 int ret;
2851 2536
2852 list_for_each_entry(set, &cfts[0].ss->cftsets, node) { 2537 if (!cfts || cfts[0].name[0] == '\0')
2853 if (set->cfts == cfts) { 2538 return 0;
2854 list_del(&set->node); 2539
2855 kfree(set); 2540 ret = cgroup_init_cftypes(ss, cfts);
2856 cgroup_cfts_commit(cfts, false); 2541 if (ret)
2857 return 0; 2542 return ret;
2858 } 2543
2859 } 2544 mutex_lock(&cgroup_tree_mutex);
2860 2545
2861 cgroup_cfts_commit(NULL, false); 2546 list_add_tail(&cfts->node, &ss->cfts);
2862 return -ENOENT; 2547 ret = cgroup_apply_cftypes(cfts, true);
2548 if (ret)
2549 cgroup_rm_cftypes_locked(cfts);
2550
2551 mutex_unlock(&cgroup_tree_mutex);
2552 return ret;
2863} 2553}
2864 2554
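A usage sketch for cgroup_add_cftypes() as documented above; demo_* identifiers are hypothetical and only the simplest kind of file, a read-only u64, is shown:

static u64 demo_weight_read(struct cgroup_subsys_state *css,
			    struct cftype *cft)
{
	return 10;	/* placeholder value */
}

static struct cftype demo_files[] = {
	{
		.name     = "weight",
		.read_u64 = demo_weight_read,
	},
	{ }	/* terminate */
};

/* typically called once from the controller's init path */
ret = cgroup_add_cftypes(&demo_cgrp_subsys, demo_files);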
2865/** 2555/**
@@ -2868,57 +2558,18 @@ int cgroup_rm_cftypes(struct cftype *cfts)
2868 * 2558 *
2869 * Return the number of tasks in the cgroup. 2559 * Return the number of tasks in the cgroup.
2870 */ 2560 */
2871int cgroup_task_count(const struct cgroup *cgrp) 2561static int cgroup_task_count(const struct cgroup *cgrp)
2872{ 2562{
2873 int count = 0; 2563 int count = 0;
2874 struct cgrp_cset_link *link; 2564 struct cgrp_cset_link *link;
2875 2565
2876 read_lock(&css_set_lock); 2566 down_read(&css_set_rwsem);
2877 list_for_each_entry(link, &cgrp->cset_links, cset_link) 2567 list_for_each_entry(link, &cgrp->cset_links, cset_link)
2878 count += atomic_read(&link->cset->refcount); 2568 count += atomic_read(&link->cset->refcount);
2879 read_unlock(&css_set_lock); 2569 up_read(&css_set_rwsem);
2880 return count; 2570 return count;
2881} 2571}
2882 2572
2883/*
2884 * To reduce the fork() overhead for systems that are not actually using
2885 * their cgroups capability, we don't maintain the lists running through
2886 * each css_set to its tasks until we see the list actually used - in other
2887 * words after the first call to css_task_iter_start().
2888 */
2889static void cgroup_enable_task_cg_lists(void)
2890{
2891 struct task_struct *p, *g;
2892 write_lock(&css_set_lock);
2893 use_task_css_set_links = 1;
2894 /*
2895 * We need tasklist_lock because RCU is not safe against
2896 * while_each_thread(). Besides, a forking task that has passed
2897 * cgroup_post_fork() without seeing use_task_css_set_links = 1
2898 * is not guaranteed to have its child immediately visible in the
2899 * tasklist if we walk through it with RCU.
2900 */
2901 read_lock(&tasklist_lock);
2902 do_each_thread(g, p) {
2903 task_lock(p);
2904 /*
2905 * We should check if the process is exiting, otherwise
2906 * it will race with cgroup_exit() in that the list
2907 * entry won't be deleted though the process has exited.
2908 * Do it while holding siglock so that we don't end up
2909 * racing against cgroup_exit().
2910 */
2911 spin_lock_irq(&p->sighand->siglock);
2912 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2913 list_add(&p->cg_list, &task_css_set(p)->tasks);
2914 spin_unlock_irq(&p->sighand->siglock);
2915
2916 task_unlock(p);
2917 } while_each_thread(g, p);
2918 read_unlock(&tasklist_lock);
2919 write_unlock(&css_set_lock);
2920}
2921
2922/** 2573/**
2923 * css_next_child - find the next child of a given css 2574 * css_next_child - find the next child of a given css
2924 * @pos_css: the current position (%NULL to initiate traversal) 2575 * @pos_css: the current position (%NULL to initiate traversal)
@@ -2937,7 +2588,7 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2937 struct cgroup *cgrp = parent_css->cgroup; 2588 struct cgroup *cgrp = parent_css->cgroup;
2938 struct cgroup *next; 2589 struct cgroup *next;
2939 2590
2940 cgroup_assert_mutex_or_rcu_locked(); 2591 cgroup_assert_mutexes_or_rcu_locked();
2941 2592
2942 /* 2593 /*
2943 * @pos could already have been removed. Once a cgroup is removed, 2594 * @pos could already have been removed. Once a cgroup is removed,
@@ -2973,7 +2624,6 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2973 2624
2974 return cgroup_css(next, parent_css->ss); 2625 return cgroup_css(next, parent_css->ss);
2975} 2626}
2976EXPORT_SYMBOL_GPL(css_next_child);
2977 2627
2978/** 2628/**
2979 * css_next_descendant_pre - find the next descendant for pre-order walk 2629 * css_next_descendant_pre - find the next descendant for pre-order walk
@@ -2995,7 +2645,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2995{ 2645{
2996 struct cgroup_subsys_state *next; 2646 struct cgroup_subsys_state *next;
2997 2647
2998 cgroup_assert_mutex_or_rcu_locked(); 2648 cgroup_assert_mutexes_or_rcu_locked();
2999 2649
3000 /* if first iteration, visit @root */ 2650 /* if first iteration, visit @root */
3001 if (!pos) 2651 if (!pos)
@@ -3016,7 +2666,6 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
3016 2666
3017 return NULL; 2667 return NULL;
3018} 2668}
3019EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3020 2669
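For reference, the usual calling pattern for the pre-order walk above (the post-order and rightmost variants are used the same way). root_css is a placeholder for whatever css the caller starts from; as the assertion in the function checks, the caller must hold the cgroup mutexes or rcu_read_lock():

struct cgroup_subsys_state *pos;

rcu_read_lock();
css_for_each_descendant_pre(pos, root_css) {
	/*
	 * @pos may already be on its way out; css_tryget() it before
	 * using it outside this RCU section.
	 */
	pr_debug("visiting css %p\n", pos);
}
rcu_read_unlock();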
3021/** 2670/**
3022 * css_rightmost_descendant - return the rightmost descendant of a css 2671 * css_rightmost_descendant - return the rightmost descendant of a css
@@ -3036,7 +2685,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
3036{ 2685{
3037 struct cgroup_subsys_state *last, *tmp; 2686 struct cgroup_subsys_state *last, *tmp;
3038 2687
3039 cgroup_assert_mutex_or_rcu_locked(); 2688 cgroup_assert_mutexes_or_rcu_locked();
3040 2689
3041 do { 2690 do {
3042 last = pos; 2691 last = pos;
@@ -3048,7 +2697,6 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
3048 2697
3049 return last; 2698 return last;
3050} 2699}
3051EXPORT_SYMBOL_GPL(css_rightmost_descendant);
3052 2700
3053static struct cgroup_subsys_state * 2701static struct cgroup_subsys_state *
3054css_leftmost_descendant(struct cgroup_subsys_state *pos) 2702css_leftmost_descendant(struct cgroup_subsys_state *pos)
@@ -3084,7 +2732,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3084{ 2732{
3085 struct cgroup_subsys_state *next; 2733 struct cgroup_subsys_state *next;
3086 2734
3087 cgroup_assert_mutex_or_rcu_locked(); 2735 cgroup_assert_mutexes_or_rcu_locked();
3088 2736
3089 /* if first iteration, visit leftmost descendant which may be @root */ 2737 /* if first iteration, visit leftmost descendant which may be @root */
3090 if (!pos) 2738 if (!pos)
@@ -3102,7 +2750,6 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
3102 /* no sibling left, visit parent */ 2750 /* no sibling left, visit parent */
3103 return css_parent(pos); 2751 return css_parent(pos);
3104} 2752}
3105EXPORT_SYMBOL_GPL(css_next_descendant_post);
3106 2753
3107/** 2754/**
3108 * css_advance_task_iter - advance a task iterator to the next css_set 2755 * css_advance_task_iter - advance a task iterator to the next css_set
@@ -3125,9 +2772,14 @@ static void css_advance_task_iter(struct css_task_iter *it)
3125 } 2772 }
3126 link = list_entry(l, struct cgrp_cset_link, cset_link); 2773 link = list_entry(l, struct cgrp_cset_link, cset_link);
3127 cset = link->cset; 2774 cset = link->cset;
3128 } while (list_empty(&cset->tasks)); 2775 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2776
3129 it->cset_link = l; 2777 it->cset_link = l;
3130 it->task = cset->tasks.next; 2778
2779 if (!list_empty(&cset->tasks))
2780 it->task = cset->tasks.next;
2781 else
2782 it->task = cset->mg_tasks.next;
3131} 2783}
3132 2784
3133/** 2785/**
@@ -3146,17 +2798,12 @@ static void css_advance_task_iter(struct css_task_iter *it)
3146 */ 2798 */
3147void css_task_iter_start(struct cgroup_subsys_state *css, 2799void css_task_iter_start(struct cgroup_subsys_state *css,
3148 struct css_task_iter *it) 2800 struct css_task_iter *it)
3149 __acquires(css_set_lock) 2801 __acquires(css_set_rwsem)
3150{ 2802{
3151 /* 2803 /* no one should try to iterate before mounting cgroups */
3152 * The first time anyone tries to iterate across a css, we need to 2804 WARN_ON_ONCE(!use_task_css_set_links);
3153 * enable the list linking each css_set to its tasks, and fix up
3154 * all existing tasks.
3155 */
3156 if (!use_task_css_set_links)
3157 cgroup_enable_task_cg_lists();
3158 2805
3159 read_lock(&css_set_lock); 2806 down_read(&css_set_rwsem);
3160 2807
3161 it->origin_css = css; 2808 it->origin_css = css;
3162 it->cset_link = &css->cgroup->cset_links; 2809 it->cset_link = &css->cgroup->cset_links;
@@ -3176,24 +2823,29 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
3176{ 2823{
3177 struct task_struct *res; 2824 struct task_struct *res;
3178 struct list_head *l = it->task; 2825 struct list_head *l = it->task;
3179 struct cgrp_cset_link *link; 2826 struct cgrp_cset_link *link = list_entry(it->cset_link,
2827 struct cgrp_cset_link, cset_link);
3180 2828
3181 /* If the iterator cg is NULL, we have no tasks */ 2829 /* If the iterator cg is NULL, we have no tasks */
3182 if (!it->cset_link) 2830 if (!it->cset_link)
3183 return NULL; 2831 return NULL;
3184 res = list_entry(l, struct task_struct, cg_list); 2832 res = list_entry(l, struct task_struct, cg_list);
3185 /* Advance iterator to find next entry */ 2833
2834 /*
2835 * Advance iterator to find next entry. cset->tasks is consumed
2836 * first and then ->mg_tasks. After ->mg_tasks, we move onto the
2837 * next cset.
2838 */
3186 l = l->next; 2839 l = l->next;
3187 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 2840
3188 if (l == &link->cset->tasks) { 2841 if (l == &link->cset->tasks)
3189 /* 2842 l = link->cset->mg_tasks.next;
3190 * We reached the end of this task list - move on to the 2843
3191 * next cgrp_cset_link. 2844 if (l == &link->cset->mg_tasks)
3192 */
3193 css_advance_task_iter(it); 2845 css_advance_task_iter(it);
3194 } else { 2846 else
3195 it->task = l; 2847 it->task = l;
3196 } 2848
3197 return res; 2849 return res;
3198} 2850}
3199 2851
@@ -3204,191 +2856,62 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
3204 * Finish task iteration started by css_task_iter_start(). 2856 * Finish task iteration started by css_task_iter_start().
3205 */ 2857 */
3206void css_task_iter_end(struct css_task_iter *it) 2858void css_task_iter_end(struct css_task_iter *it)
3207 __releases(css_set_lock) 2859 __releases(css_set_rwsem)
3208{
3209 read_unlock(&css_set_lock);
3210}
3211
3212static inline int started_after_time(struct task_struct *t1,
3213 struct timespec *time,
3214 struct task_struct *t2)
3215{
3216 int start_diff = timespec_compare(&t1->start_time, time);
3217 if (start_diff > 0) {
3218 return 1;
3219 } else if (start_diff < 0) {
3220 return 0;
3221 } else {
3222 /*
3223 * Arbitrarily, if two processes started at the same
3224 * time, we'll say that the lower pointer value
3225 * started first. Note that t2 may have exited by now
3226 * so this may not be a valid pointer any longer, but
3227 * that's fine - it still serves to distinguish
3228 * between two tasks started (effectively) simultaneously.
3229 */
3230 return t1 > t2;
3231 }
3232}
3233
3234/*
3235 * This function is a callback from heap_insert() and is used to order
3236 * the heap.
3237 * In this case we order the heap in descending task start time.
3238 */
3239static inline int started_after(void *p1, void *p2)
3240{ 2860{
3241 struct task_struct *t1 = p1; 2861 up_read(&css_set_rwsem);
3242 struct task_struct *t2 = p2;
3243 return started_after_time(t1, &t2->start_time, t2);
3244} 2862}
3245 2863
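A usage sketch of the task iterator whose start/next/end triplet appears above. The iterator read-holds css_set_rwsem from css_task_iter_start() to css_task_iter_end(), so the body of the walk should stay short:

struct css_task_iter it;
struct task_struct *task;

css_task_iter_start(css, &it);
while ((task = css_task_iter_next(&it))) {
	/* inspect *task; both cset->tasks and cset->mg_tasks are visited */
}
css_task_iter_end(&it);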
3246/** 2864/**
3247 * css_scan_tasks - iterate through all the tasks in a css 2865 * cgroup_transfer_tasks - move tasks from one cgroup to another
3248 * @css: the css to iterate tasks of 2866 * @to: cgroup to which the tasks will be moved
3249 * @test: optional test callback 2867 * @from: cgroup in which the tasks currently reside
3250 * @process: process callback
3251 * @data: data passed to @test and @process
3252 * @heap: optional pre-allocated heap used for task iteration
3253 *
3254 * Iterate through all the tasks in @css, calling @test for each, and if it
3255 * returns %true, call @process for it also.
3256 *
3257 * @test may be NULL, meaning always true (select all tasks), which
3258 * effectively duplicates css_task_iter_{start,next,end}() but does not
3259 * lock css_set_lock for the call to @process.
3260 *
3261 * It is guaranteed that @process will act on every task that is a member
3262 * of @css for the duration of this call. This function may or may not
3263 * call @process for tasks that exit or move to a different css during the
3264 * call, or are forked or move into the css during the call.
3265 *
3266 * Note that @test may be called with locks held, and may in some
3267 * situations be called multiple times for the same task, so it should be
3268 * cheap.
3269 * 2868 *
3270 * If @heap is non-NULL, a heap has been pre-allocated and will be used for 2869 * Locking rules between cgroup_post_fork() and the migration path
3271 * heap operations (and its "gt" member will be overwritten), else a 2870 * guarantee that, if a task is forking while being migrated, the new child
3272 * temporary heap will be used (allocation of which may cause this function 2871 * is guaranteed to be either visible in the source cgroup after the
3273 * to fail). 2872 * parent's migration is complete or put into the target cgroup. No task
2873 * can slip out of migration through forking.
3274 */ 2874 */
3275int css_scan_tasks(struct cgroup_subsys_state *css, 2875int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3276 bool (*test)(struct task_struct *, void *),
3277 void (*process)(struct task_struct *, void *),
3278 void *data, struct ptr_heap *heap)
3279{ 2876{
3280 int retval, i; 2877 LIST_HEAD(preloaded_csets);
2878 struct cgrp_cset_link *link;
3281 struct css_task_iter it; 2879 struct css_task_iter it;
3282 struct task_struct *p, *dropped; 2880 struct task_struct *task;
3283 /* Never dereference latest_task, since it's not refcounted */ 2881 int ret;
3284 struct task_struct *latest_task = NULL;
3285 struct ptr_heap tmp_heap;
3286 struct timespec latest_time = { 0, 0 };
3287
3288 if (heap) {
3289 /* The caller supplied our heap and pre-allocated its memory */
3290 heap->gt = &started_after;
3291 } else {
3292 /* We need to allocate our own heap memory */
3293 heap = &tmp_heap;
3294 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
3295 if (retval)
3296 /* cannot allocate the heap */
3297 return retval;
3298 }
3299 2882
3300 again: 2883 mutex_lock(&cgroup_mutex);
3301 /*
3302 * Scan tasks in the css, using the @test callback to determine
3303 * which are of interest, and invoking @process callback on the
3304 * ones which need an update. Since we don't want to hold any
3305 * locks during the task updates, gather tasks to be processed in a
3306 * heap structure. The heap is sorted by descending task start
3307 * time. If the statically-sized heap fills up, we overflow tasks
3308 * that started later, and in future iterations only consider tasks
3309 * that started after the latest task in the previous pass. This
3310 * guarantees forward progress and that we don't miss any tasks.
3311 */
3312 heap->size = 0;
3313 css_task_iter_start(css, &it);
3314 while ((p = css_task_iter_next(&it))) {
3315 /*
3316 * Only affect tasks that qualify per the caller's callback,
3317 * if he provided one
3318 */
3319 if (test && !test(p, data))
3320 continue;
3321 /*
3322 * Only process tasks that started after the last task
3323 * we processed
3324 */
3325 if (!started_after_time(p, &latest_time, latest_task))
3326 continue;
3327 dropped = heap_insert(heap, p);
3328 if (dropped == NULL) {
3329 /*
3330 * The new task was inserted; the heap wasn't
3331 * previously full
3332 */
3333 get_task_struct(p);
3334 } else if (dropped != p) {
3335 /*
3336 * The new task was inserted, and pushed out a
3337 * different task
3338 */
3339 get_task_struct(p);
3340 put_task_struct(dropped);
3341 }
3342 /*
3343 * Else the new task was newer than anything already in
3344 * the heap and wasn't inserted
3345 */
3346 }
3347 css_task_iter_end(&it);
3348 2884
3349 if (heap->size) { 2885 /* all tasks in @from are being moved, all csets are source */
3350 for (i = 0; i < heap->size; i++) { 2886 down_read(&css_set_rwsem);
3351 struct task_struct *q = heap->ptrs[i]; 2887 list_for_each_entry(link, &from->cset_links, cset_link)
3352 if (i == 0) { 2888 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3353 latest_time = q->start_time; 2889 up_read(&css_set_rwsem);
3354 latest_task = q;
3355 }
3356 /* Process the task per the caller's callback */
3357 process(q, data);
3358 put_task_struct(q);
3359 }
3360 /*
3361 * If we had to process any tasks at all, scan again
3362 * in case some of them were in the middle of forking
3363 * children that didn't get processed.
3364 * Not the most efficient way to do it, but it avoids
3365 * having to take callback_mutex in the fork path
3366 */
3367 goto again;
3368 }
3369 if (heap == &tmp_heap)
3370 heap_free(&tmp_heap);
3371 return 0;
3372}
3373 2890
3374static void cgroup_transfer_one_task(struct task_struct *task, void *data) 2891 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3375{ 2892 if (ret)
3376 struct cgroup *new_cgroup = data; 2893 goto out_err;
3377 2894
3378 mutex_lock(&cgroup_mutex); 2895 /*
3379 cgroup_attach_task(new_cgroup, task, false); 2896 * Migrate tasks one-by-one until @from is empty. This fails iff
2897 * ->can_attach() fails.
2898 */
2899 do {
2900 css_task_iter_start(&from->dummy_css, &it);
2901 task = css_task_iter_next(&it);
2902 if (task)
2903 get_task_struct(task);
2904 css_task_iter_end(&it);
2905
2906 if (task) {
2907 ret = cgroup_migrate(to, task, false);
2908 put_task_struct(task);
2909 }
2910 } while (task && !ret);
2911out_err:
2912 cgroup_migrate_finish(&preloaded_csets);
3380 mutex_unlock(&cgroup_mutex); 2913 mutex_unlock(&cgroup_mutex);
3381} 2914 return ret;
3382
3383/**
3384 * cgroup_transfer_tasks - move tasks from one cgroup to another
3385 * @to: cgroup to which the tasks will be moved
3386 * @from: cgroup in which the tasks currently reside
3387 */
3388int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3389{
3390 return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3391 to, NULL);
3392} 2915}
3393 2916
3394/* 2917/*
@@ -3687,21 +3210,31 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3687 */ 3210 */
3688int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 3211int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3689{ 3212{
3690 int ret = -EINVAL; 3213 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
3691 struct cgroup *cgrp; 3214 struct cgroup *cgrp;
3692 struct css_task_iter it; 3215 struct css_task_iter it;
3693 struct task_struct *tsk; 3216 struct task_struct *tsk;
3694 3217
3218 /* it should be kernfs_node belonging to cgroupfs and is a directory */
3219 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
3220 kernfs_type(kn) != KERNFS_DIR)
3221 return -EINVAL;
3222
3223 mutex_lock(&cgroup_mutex);
3224
3695 /* 3225 /*
3696 * Validate dentry by checking the superblock operations, 3226 * We aren't being called from kernfs and there's no guarantee on
3697 * and make sure it's a directory. 3227 * @kn->priv's validity. For this and css_tryget_from_dir(),
3228 * @kn->priv is RCU safe. Let's do the RCU dancing.
3698 */ 3229 */
3699 if (dentry->d_sb->s_op != &cgroup_ops || 3230 rcu_read_lock();
3700 !S_ISDIR(dentry->d_inode->i_mode)) 3231 cgrp = rcu_dereference(kn->priv);
3701 goto err; 3232 if (!cgrp || cgroup_is_dead(cgrp)) {
3702 3233 rcu_read_unlock();
3703 ret = 0; 3234 mutex_unlock(&cgroup_mutex);
3704 cgrp = dentry->d_fsdata; 3235 return -ENOENT;
3236 }
3237 rcu_read_unlock();
3705 3238
3706 css_task_iter_start(&cgrp->dummy_css, &it); 3239 css_task_iter_start(&cgrp->dummy_css, &it);
3707 while ((tsk = css_task_iter_next(&it))) { 3240 while ((tsk = css_task_iter_next(&it))) {
@@ -3726,8 +3259,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3726 } 3259 }
3727 css_task_iter_end(&it); 3260 css_task_iter_end(&it);
3728 3261
3729err: 3262 mutex_unlock(&cgroup_mutex);
3730 return ret; 3263 return 0;
3731} 3264}
3732 3265
3733 3266
@@ -3745,7 +3278,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3745 * after a seek to the start). Use a binary-search to find the 3278 * after a seek to the start). Use a binary-search to find the
3746 * next pid to display, if any 3279 * next pid to display, if any
3747 */ 3280 */
3748 struct cgroup_open_file *of = s->private; 3281 struct kernfs_open_file *of = s->private;
3749 struct cgroup *cgrp = seq_css(s)->cgroup; 3282 struct cgroup *cgrp = seq_css(s)->cgroup;
3750 struct cgroup_pidlist *l; 3283 struct cgroup_pidlist *l;
3751 enum cgroup_filetype type = seq_cft(s)->private; 3284 enum cgroup_filetype type = seq_cft(s)->private;
@@ -3800,7 +3333,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3800 3333
3801static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3334static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3802{ 3335{
3803 struct cgroup_open_file *of = s->private; 3336 struct kernfs_open_file *of = s->private;
3804 struct cgroup_pidlist *l = of->priv; 3337 struct cgroup_pidlist *l = of->priv;
3805 3338
3806 if (l) 3339 if (l)
@@ -3811,7 +3344,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3811 3344
3812static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3345static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3813{ 3346{
3814 struct cgroup_open_file *of = s->private; 3347 struct kernfs_open_file *of = s->private;
3815 struct cgroup_pidlist *l = of->priv; 3348 struct cgroup_pidlist *l = of->priv;
3816 pid_t *p = v; 3349 pid_t *p = v;
3817 pid_t *end = l->list + l->length; 3350 pid_t *end = l->list + l->length;
@@ -3861,23 +3394,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3861 return 0; 3394 return 0;
3862} 3395}
3863 3396
3864/*
3865 * When dput() is called asynchronously, if umount has been done and
3866 * then deactivate_super() in cgroup_free_fn() kills the superblock,
3867 * there's a small window that vfs will see the root dentry with non-zero
3868 * refcnt and trigger BUG().
3869 *
3870 * That's why we hold a reference before dput() and drop it right after.
3871 */
3872static void cgroup_dput(struct cgroup *cgrp)
3873{
3874 struct super_block *sb = cgrp->root->sb;
3875
3876 atomic_inc(&sb->s_active);
3877 dput(cgrp->dentry);
3878 deactivate_super(sb);
3879}
3880
3881static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3397static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
3882 struct cftype *cft) 3398 struct cftype *cft)
3883{ 3399{
@@ -3944,7 +3460,7 @@ static struct cftype cgroup_base_files[] = {
3944 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 3460 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3945 .seq_show = cgroup_release_agent_show, 3461 .seq_show = cgroup_release_agent_show,
3946 .write_string = cgroup_release_agent_write, 3462 .write_string = cgroup_release_agent_write,
3947 .max_write_len = PATH_MAX, 3463 .max_write_len = PATH_MAX - 1,
3948 }, 3464 },
3949 { } /* terminate */ 3465 { } /* terminate */
3950}; 3466};
@@ -3963,13 +3479,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3963 3479
3964 /* process cftsets of each subsystem */ 3480 /* process cftsets of each subsystem */
3965 for_each_subsys(ss, i) { 3481 for_each_subsys(ss, i) {
3966 struct cftype_set *set; 3482 struct cftype *cfts;
3967 3483
3968 if (!test_bit(i, &subsys_mask)) 3484 if (!test_bit(i, &subsys_mask))
3969 continue; 3485 continue;
3970 3486
3971 list_for_each_entry(set, &ss->cftsets, node) { 3487 list_for_each_entry(cfts, &ss->cfts, node) {
3972 ret = cgroup_addrm_files(cgrp, set->cfts, true); 3488 ret = cgroup_addrm_files(cgrp, cfts, true);
3973 if (ret < 0) 3489 if (ret < 0)
3974 goto err; 3490 goto err;
3975 } 3491 }
@@ -4012,7 +3528,7 @@ static void css_free_work_fn(struct work_struct *work)
4012 css_put(css->parent); 3528 css_put(css->parent);
4013 3529
4014 css->ss->css_free(css); 3530 css->ss->css_free(css);
4015 cgroup_dput(cgrp); 3531 cgroup_put(cgrp);
4016} 3532}
4017 3533
4018static void css_free_rcu_fn(struct rcu_head *rcu_head) 3534static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -4020,10 +3536,6 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
4020 struct cgroup_subsys_state *css = 3536 struct cgroup_subsys_state *css =
4021 container_of(rcu_head, struct cgroup_subsys_state, rcu_head); 3537 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4022 3538
4023 /*
4024 * css holds an extra ref to @cgrp->dentry which is put on the last
4025 * css_put(). dput() requires process context which we don't have.
4026 */
4027 INIT_WORK(&css->destroy_work, css_free_work_fn); 3539 INIT_WORK(&css->destroy_work, css_free_work_fn);
4028 queue_work(cgroup_destroy_wq, &css->destroy_work); 3540 queue_work(cgroup_destroy_wq, &css->destroy_work);
4029} 3541}
@@ -4033,7 +3545,7 @@ static void css_release(struct percpu_ref *ref)
4033 struct cgroup_subsys_state *css = 3545 struct cgroup_subsys_state *css =
4034 container_of(ref, struct cgroup_subsys_state, refcnt); 3546 container_of(ref, struct cgroup_subsys_state, refcnt);
4035 3547
4036 rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); 3548 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);
4037 call_rcu(&css->rcu_head, css_free_rcu_fn); 3549 call_rcu(&css->rcu_head, css_free_rcu_fn);
4038} 3550}
4039 3551
@@ -4058,6 +3570,7 @@ static int online_css(struct cgroup_subsys_state *css)
4058 struct cgroup_subsys *ss = css->ss; 3570 struct cgroup_subsys *ss = css->ss;
4059 int ret = 0; 3571 int ret = 0;
4060 3572
3573 lockdep_assert_held(&cgroup_tree_mutex);
4061 lockdep_assert_held(&cgroup_mutex); 3574 lockdep_assert_held(&cgroup_mutex);
4062 3575
4063 if (ss->css_online) 3576 if (ss->css_online)
@@ -4065,7 +3578,7 @@ static int online_css(struct cgroup_subsys_state *css)
4065 if (!ret) { 3578 if (!ret) {
4066 css->flags |= CSS_ONLINE; 3579 css->flags |= CSS_ONLINE;
4067 css->cgroup->nr_css++; 3580 css->cgroup->nr_css++;
4068 rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); 3581 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
4069 } 3582 }
4070 return ret; 3583 return ret;
4071} 3584}
@@ -4075,6 +3588,7 @@ static void offline_css(struct cgroup_subsys_state *css)
4075{ 3588{
4076 struct cgroup_subsys *ss = css->ss; 3589 struct cgroup_subsys *ss = css->ss;
4077 3590
3591 lockdep_assert_held(&cgroup_tree_mutex);
4078 lockdep_assert_held(&cgroup_mutex); 3592 lockdep_assert_held(&cgroup_mutex);
4079 3593
4080 if (!(css->flags & CSS_ONLINE)) 3594 if (!(css->flags & CSS_ONLINE))
@@ -4085,7 +3599,7 @@ static void offline_css(struct cgroup_subsys_state *css)
4085 3599
4086 css->flags &= ~CSS_ONLINE; 3600 css->flags &= ~CSS_ONLINE;
4087 css->cgroup->nr_css--; 3601 css->cgroup->nr_css--;
4088 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); 3602 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);
4089} 3603}
4090 3604
4091/** 3605/**
@@ -4103,7 +3617,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4103 struct cgroup_subsys_state *css; 3617 struct cgroup_subsys_state *css;
4104 int err; 3618 int err;
4105 3619
4106 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
4107 lockdep_assert_held(&cgroup_mutex); 3620 lockdep_assert_held(&cgroup_mutex);
4108 3621
4109 css = ss->css_alloc(cgroup_css(parent, ss)); 3622 css = ss->css_alloc(cgroup_css(parent, ss));
@@ -4112,21 +3625,23 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4112 3625
4113 err = percpu_ref_init(&css->refcnt, css_release); 3626 err = percpu_ref_init(&css->refcnt, css_release);
4114 if (err) 3627 if (err)
4115 goto err_free; 3628 goto err_free_css;
4116 3629
4117 init_css(css, ss, cgrp); 3630 init_css(css, ss, cgrp);
4118 3631
4119 err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); 3632 err = cgroup_populate_dir(cgrp, 1 << ss->id);
4120 if (err) 3633 if (err)
4121 goto err_free; 3634 goto err_free_percpu_ref;
4122 3635
4123 err = online_css(css); 3636 err = online_css(css);
4124 if (err) 3637 if (err)
4125 goto err_free; 3638 goto err_clear_dir;
4126 3639
4127 dget(cgrp->dentry); 3640 cgroup_get(cgrp);
4128 css_get(css->parent); 3641 css_get(css->parent);
4129 3642
3643 cgrp->subsys_mask |= 1 << ss->id;
3644
4130 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 3645 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4131 parent->parent) { 3646 parent->parent) {
4132 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 3647 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
@@ -4138,41 +3653,43 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4138 3653
4139 return 0; 3654 return 0;
4140 3655
4141err_free: 3656err_clear_dir:
3657 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
3658err_free_percpu_ref:
4142 percpu_ref_cancel_init(&css->refcnt); 3659 percpu_ref_cancel_init(&css->refcnt);
3660err_free_css:
4143 ss->css_free(css); 3661 ss->css_free(css);
4144 return err; 3662 return err;
4145} 3663}
4146 3664
4147/* 3665/**
4148 * cgroup_create - create a cgroup 3666 * cgroup_create - create a cgroup
4149 * @parent: cgroup that will be parent of the new cgroup 3667 * @parent: cgroup that will be parent of the new cgroup
4150 * @dentry: dentry of the new cgroup 3668 * @name: name of the new cgroup
4151 * @mode: mode to set on new inode 3669 * @mode: mode to set on new cgroup
4152 *
4153 * Must be called with the mutex on the parent inode held
4154 */ 3670 */
4155static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3671static long cgroup_create(struct cgroup *parent, const char *name,
4156 umode_t mode) 3672 umode_t mode)
4157{ 3673{
4158 struct cgroup *cgrp; 3674 struct cgroup *cgrp;
4159 struct cgroup_name *name; 3675 struct cgroup_root *root = parent->root;
4160 struct cgroupfs_root *root = parent->root;
4161 int ssid, err; 3676 int ssid, err;
4162 struct cgroup_subsys *ss; 3677 struct cgroup_subsys *ss;
4163 struct super_block *sb = root->sb; 3678 struct kernfs_node *kn;
3679
3680 /*
3681 * XXX: The default hierarchy isn't fully implemented yet. Block
3682 * !root cgroup creation on it for now.
3683 */
3684 if (root == &cgrp_dfl_root)
3685 return -EINVAL;
4164 3686
4165 /* allocate the cgroup and its ID, 0 is reserved for the root */ 3687 /* allocate the cgroup and its ID, 0 is reserved for the root */
4166 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 3688 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4167 if (!cgrp) 3689 if (!cgrp)
4168 return -ENOMEM; 3690 return -ENOMEM;
4169 3691
4170 name = cgroup_alloc_name(dentry); 3692 mutex_lock(&cgroup_tree_mutex);
4171 if (!name) {
4172 err = -ENOMEM;
4173 goto err_free_cgrp;
4174 }
4175 rcu_assign_pointer(cgrp->name, name);
4176 3693
4177 /* 3694 /*
4178 * Only live parents can have children. Note that the liveliness 3695 * Only live parents can have children. Note that the liveliness
@@ -4183,7 +3700,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4183 */ 3700 */
4184 if (!cgroup_lock_live_group(parent)) { 3701 if (!cgroup_lock_live_group(parent)) {
4185 err = -ENODEV; 3702 err = -ENODEV;
4186 goto err_free_name; 3703 goto err_unlock_tree;
4187 } 3704 }
4188 3705
4189 /* 3706 /*
@@ -4196,18 +3713,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4196 goto err_unlock; 3713 goto err_unlock;
4197 } 3714 }
4198 3715
4199 /* Grab a reference on the superblock so the hierarchy doesn't
4200 * get deleted on unmount if there are child cgroups. This
4201 * can be done outside cgroup_mutex, since the sb can't
4202 * disappear while someone has an open control file on the
4203 * fs */
4204 atomic_inc(&sb->s_active);
4205
4206 init_cgroup_housekeeping(cgrp); 3716 init_cgroup_housekeeping(cgrp);
4207 3717
4208 dentry->d_fsdata = cgrp;
4209 cgrp->dentry = dentry;
4210
4211 cgrp->parent = parent; 3718 cgrp->parent = parent;
4212 cgrp->dummy_css.parent = &parent->dummy_css; 3719 cgrp->dummy_css.parent = &parent->dummy_css;
4213 cgrp->root = parent->root; 3720 cgrp->root = parent->root;
@@ -4218,24 +3725,26 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4218 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 3725 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4219 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3726 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4220 3727
3728 /* create the directory */
3729 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3730 if (IS_ERR(kn)) {
3731 err = PTR_ERR(kn);
3732 goto err_free_id;
3733 }
3734 cgrp->kn = kn;
3735
4221 /* 3736 /*
4222 * Create directory. cgroup_create_file() returns with the new 3737 * This extra ref will be put in cgroup_free_fn() and guarantees
4223 * directory locked on success so that it can be populated without 3738 * that @cgrp->kn is always accessible.
4224 * dropping cgroup_mutex.
4225 */ 3739 */
4226 err = cgroup_create_file(dentry, S_IFDIR | mode, sb); 3740 kernfs_get(kn);
4227 if (err < 0)
4228 goto err_free_id;
4229 lockdep_assert_held(&dentry->d_inode->i_mutex);
4230 3741
4231 cgrp->serial_nr = cgroup_serial_nr_next++; 3742 cgrp->serial_nr = cgroup_serial_nr_next++;
4232 3743
4233 /* allocation complete, commit to creation */ 3744 /* allocation complete, commit to creation */
4234 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 3745 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4235 root->number_of_cgroups++; 3746 atomic_inc(&root->nr_cgrps);
4236 3747 cgroup_get(parent);
4237 /* hold a ref to the parent's dentry */
4238 dget(parent->dentry);
4239 3748
4240 /* 3749 /*
4241 * @cgrp is now fully operational. If something fails after this 3750 * @cgrp is now fully operational. If something fails after this
@@ -4249,43 +3758,56 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4249 3758
4250 /* let's create and online css's */ 3759 /* let's create and online css's */
4251 for_each_subsys(ss, ssid) { 3760 for_each_subsys(ss, ssid) {
4252 if (root->subsys_mask & (1 << ssid)) { 3761 if (root->cgrp.subsys_mask & (1 << ssid)) {
4253 err = create_css(cgrp, ss); 3762 err = create_css(cgrp, ss);
4254 if (err) 3763 if (err)
4255 goto err_destroy; 3764 goto err_destroy;
4256 } 3765 }
4257 } 3766 }
4258 3767
3768 kernfs_activate(kn);
3769
4259 mutex_unlock(&cgroup_mutex); 3770 mutex_unlock(&cgroup_mutex);
4260 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 3771 mutex_unlock(&cgroup_tree_mutex);
4261 3772
4262 return 0; 3773 return 0;
4263 3774
4264err_free_id: 3775err_free_id:
4265 idr_remove(&root->cgroup_idr, cgrp->id); 3776 idr_remove(&root->cgroup_idr, cgrp->id);
4266 /* Release the reference count that we took on the superblock */
4267 deactivate_super(sb);
4268err_unlock: 3777err_unlock:
4269 mutex_unlock(&cgroup_mutex); 3778 mutex_unlock(&cgroup_mutex);
4270err_free_name: 3779err_unlock_tree:
4271 kfree(rcu_dereference_raw(cgrp->name)); 3780 mutex_unlock(&cgroup_tree_mutex);
4272err_free_cgrp:
4273 kfree(cgrp); 3781 kfree(cgrp);
4274 return err; 3782 return err;
4275 3783
4276err_destroy: 3784err_destroy:
4277 cgroup_destroy_locked(cgrp); 3785 cgroup_destroy_locked(cgrp);
4278 mutex_unlock(&cgroup_mutex); 3786 mutex_unlock(&cgroup_mutex);
4279 mutex_unlock(&dentry->d_inode->i_mutex); 3787 mutex_unlock(&cgroup_tree_mutex);
4280 return err; 3788 return err;
4281} 3789}
4282 3790
4283static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 3791static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3792 umode_t mode)
4284{ 3793{
4285 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3794 struct cgroup *parent = parent_kn->priv;
3795 int ret;
3796
3797 /*
3798 * cgroup_create() grabs cgroup_tree_mutex which nests outside
3799 * kernfs active_ref and cgroup_create() already synchronizes
3800 * properly against removal through cgroup_lock_live_group().
3801 * Break it before calling cgroup_create().
3802 */
3803 cgroup_get(parent);
3804 kernfs_break_active_protection(parent_kn);
3805
3806 ret = cgroup_create(parent, name, mode);
4286 3807
4287 /* the vfs holds inode->i_mutex already */ 3808 kernfs_unbreak_active_protection(parent_kn);
4288 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 3809 cgroup_put(parent);
3810 return ret;
4289} 3811}
4290 3812
4291/* 3813/*
@@ -4298,6 +3820,7 @@ static void css_killed_work_fn(struct work_struct *work)
4298 container_of(work, struct cgroup_subsys_state, destroy_work); 3820 container_of(work, struct cgroup_subsys_state, destroy_work);
4299 struct cgroup *cgrp = css->cgroup; 3821 struct cgroup *cgrp = css->cgroup;
4300 3822
3823 mutex_lock(&cgroup_tree_mutex);
4301 mutex_lock(&cgroup_mutex); 3824 mutex_lock(&cgroup_mutex);
4302 3825
4303 /* 3826 /*
@@ -4315,6 +3838,7 @@ static void css_killed_work_fn(struct work_struct *work)
4315 cgroup_destroy_css_killed(cgrp); 3838 cgroup_destroy_css_killed(cgrp);
4316 3839
4317 mutex_unlock(&cgroup_mutex); 3840 mutex_unlock(&cgroup_mutex);
3841 mutex_unlock(&cgroup_tree_mutex);
4318 3842
4319 /* 3843 /*
4320 * Put the css refs from kill_css(). Each css holds an extra 3844 * Put the css refs from kill_css(). Each css holds an extra
@@ -4336,18 +3860,15 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
4336 queue_work(cgroup_destroy_wq, &css->destroy_work); 3860 queue_work(cgroup_destroy_wq, &css->destroy_work);
4337} 3861}
4338 3862
4339/** 3863static void __kill_css(struct cgroup_subsys_state *css)
4340 * kill_css - destroy a css
4341 * @css: css to destroy
4342 *
4343 * This function initiates destruction of @css by removing cgroup interface
4344 * files and putting its base reference. ->css_offline() will be invoked
4345 * asynchronously once css_tryget() is guaranteed to fail and when the
4346 * reference count reaches zero, @css will be released.
4347 */
4348static void kill_css(struct cgroup_subsys_state *css)
4349{ 3864{
4350 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); 3865 lockdep_assert_held(&cgroup_tree_mutex);
3866
3867 /*
3868 * This must happen before css is disassociated with its cgroup.
3869 * See seq_css() for details.
3870 */
3871 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4351 3872
4352 /* 3873 /*
4353 * Killing would put the base ref, but we need to keep it alive 3874 * Killing would put the base ref, but we need to keep it alive
@@ -4369,6 +3890,28 @@ static void kill_css(struct cgroup_subsys_state *css)
4369} 3890}
4370 3891
4371/** 3892/**
3893 * kill_css - destroy a css
3894 * @css: css to destroy
3895 *
3896 * This function initiates destruction of @css by removing cgroup interface
3897 * files and putting its base reference. ->css_offline() will be invoked
3898 * asynchronously once css_tryget() is guaranteed to fail and when the
3899 * reference count reaches zero, @css will be released.
3900 */
3901static void kill_css(struct cgroup_subsys_state *css)
3902{
3903 struct cgroup *cgrp = css->cgroup;
3904
3905 lockdep_assert_held(&cgroup_tree_mutex);
3906
3907 /* if already killed, noop */
3908 if (cgrp->subsys_mask & (1 << css->ss->id)) {
3909 cgrp->subsys_mask &= ~(1 << css->ss->id);
3910 __kill_css(css);
3911 }
3912}
3913
3914/**
4372 * cgroup_destroy_locked - the first stage of cgroup destruction 3915 * cgroup_destroy_locked - the first stage of cgroup destruction
4373 * @cgrp: cgroup to be destroyed 3916 * @cgrp: cgroup to be destroyed
4374 * 3917 *
@@ -4395,22 +3938,21 @@ static void kill_css(struct cgroup_subsys_state *css)
4395static int cgroup_destroy_locked(struct cgroup *cgrp) 3938static int cgroup_destroy_locked(struct cgroup *cgrp)
4396 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 3939 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4397{ 3940{
4398 struct dentry *d = cgrp->dentry;
4399 struct cgroup_subsys_state *css;
4400 struct cgroup *child; 3941 struct cgroup *child;
3942 struct cgroup_subsys_state *css;
4401 bool empty; 3943 bool empty;
4402 int ssid; 3944 int ssid;
4403 3945
4404 lockdep_assert_held(&d->d_inode->i_mutex); 3946 lockdep_assert_held(&cgroup_tree_mutex);
4405 lockdep_assert_held(&cgroup_mutex); 3947 lockdep_assert_held(&cgroup_mutex);
4406 3948
4407 /* 3949 /*
4408 * css_set_lock synchronizes access to ->cset_links and prevents 3950 * css_set_rwsem synchronizes access to ->cset_links and prevents
4409 * @cgrp from being removed while __put_css_set() is in progress. 3951 * @cgrp from being removed while put_css_set() is in progress.
4410 */ 3952 */
4411 read_lock(&css_set_lock); 3953 down_read(&css_set_rwsem);
4412 empty = list_empty(&cgrp->cset_links); 3954 empty = list_empty(&cgrp->cset_links);
4413 read_unlock(&css_set_lock); 3955 up_read(&css_set_rwsem);
4414 if (!empty) 3956 if (!empty)
4415 return -EBUSY; 3957 return -EBUSY;
4416 3958
@@ -4431,14 +3973,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4431 return -EBUSY; 3973 return -EBUSY;
4432 3974
4433 /* 3975 /*
4434 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4435 * will be invoked to perform the rest of destruction once the
4436 * percpu refs of all css's are confirmed to be killed.
4437 */
4438 for_each_css(css, ssid, cgrp)
4439 kill_css(css);
4440
4441 /*
4442 * Mark @cgrp dead. This prevents further task migration and child 3976 * Mark @cgrp dead. This prevents further task migration and child
4443 * creation by disabling cgroup_lock_live_group(). Note that 3977 * creation by disabling cgroup_lock_live_group(). Note that
4444 * CGRP_DEAD assertion is depended upon by css_next_child() to 3978 * CGRP_DEAD assertion is depended upon by css_next_child() to
@@ -4447,6 +3981,17 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4447 */ 3981 */
4448 set_bit(CGRP_DEAD, &cgrp->flags); 3982 set_bit(CGRP_DEAD, &cgrp->flags);
4449 3983
3984 /*
3985 * Initiate massacre of all css's. cgroup_destroy_css_killed()
3986 * will be invoked to perform the rest of destruction once the
3987 * percpu refs of all css's are confirmed to be killed. This
 3988	 * involves removing the subsystem's files, so drop cgroup_mutex around it.
3989 */
3990 mutex_unlock(&cgroup_mutex);
3991 for_each_css(css, ssid, cgrp)
3992 kill_css(css);
3993 mutex_lock(&cgroup_mutex);
3994
4450 /* CGRP_DEAD is set, remove from ->release_list for the last time */ 3995 /* CGRP_DEAD is set, remove from ->release_list for the last time */
4451 raw_spin_lock(&release_list_lock); 3996 raw_spin_lock(&release_list_lock);
4452 if (!list_empty(&cgrp->release_list)) 3997 if (!list_empty(&cgrp->release_list))
@@ -4462,14 +4007,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4462 if (!cgrp->nr_css) 4007 if (!cgrp->nr_css)
4463 cgroup_destroy_css_killed(cgrp); 4008 cgroup_destroy_css_killed(cgrp);
4464 4009
4010 /* remove @cgrp directory along with the base files */
4011 mutex_unlock(&cgroup_mutex);
4012
4465 /* 4013 /*
4466 * Clear the base files and remove @cgrp directory. The removal 4014 * There are two control paths which try to determine cgroup from
4467 * puts the base ref but we aren't quite done with @cgrp yet, so 4015 * dentry without going through kernfs - cgroupstats_build() and
4468 * hold onto it. 4016 * css_tryget_from_dir(). Those are supported by RCU protecting
4017 * clearing of cgrp->kn->priv backpointer, which should happen
4018 * after all files under it have been removed.
4469 */ 4019 */
4470 cgroup_addrm_files(cgrp, cgroup_base_files, false); 4020 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
4471 dget(d); 4021 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4472 cgroup_d_remove_dir(d); 4022
4023 mutex_lock(&cgroup_mutex);
4473 4024
4474 return 0; 4025 return 0;
4475}; 4026};
@@ -4486,72 +4037,82 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4486static void cgroup_destroy_css_killed(struct cgroup *cgrp) 4037static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4487{ 4038{
4488 struct cgroup *parent = cgrp->parent; 4039 struct cgroup *parent = cgrp->parent;
4489 struct dentry *d = cgrp->dentry;
4490 4040
4041 lockdep_assert_held(&cgroup_tree_mutex);
4491 lockdep_assert_held(&cgroup_mutex); 4042 lockdep_assert_held(&cgroup_mutex);
4492 4043
4493 /* delete this cgroup from parent->children */ 4044 /* delete this cgroup from parent->children */
4494 list_del_rcu(&cgrp->sibling); 4045 list_del_rcu(&cgrp->sibling);
4495 4046
4496 dput(d); 4047 cgroup_put(cgrp);
4497 4048
4498 set_bit(CGRP_RELEASABLE, &parent->flags); 4049 set_bit(CGRP_RELEASABLE, &parent->flags);
4499 check_for_release(parent); 4050 check_for_release(parent);
4500} 4051}
4501 4052
4502static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4053static int cgroup_rmdir(struct kernfs_node *kn)
4503{ 4054{
4504 int ret; 4055 struct cgroup *cgrp = kn->priv;
4505 4056 int ret = 0;
4506 mutex_lock(&cgroup_mutex);
4507 ret = cgroup_destroy_locked(dentry->d_fsdata);
4508 mutex_unlock(&cgroup_mutex);
4509 4057
4510 return ret; 4058 /*
4511} 4059 * This is self-destruction but @kn can't be removed while this
4060 * callback is in progress. Let's break active protection. Once
4061 * the protection is broken, @cgrp can be destroyed at any point.
4062 * Pin it so that it stays accessible.
4063 */
4064 cgroup_get(cgrp);
4065 kernfs_break_active_protection(kn);
4512 4066
4513static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) 4067 mutex_lock(&cgroup_tree_mutex);
4514{ 4068 mutex_lock(&cgroup_mutex);
4515 INIT_LIST_HEAD(&ss->cftsets);
4516 4069
4517 /* 4070 /*
4518 * base_cftset is embedded in subsys itself, no need to worry about 4071 * @cgrp might already have been destroyed while we're trying to
4519 * deregistration. 4072 * grab the mutexes.
4520 */ 4073 */
4521 if (ss->base_cftypes) { 4074 if (!cgroup_is_dead(cgrp))
4522 struct cftype *cft; 4075 ret = cgroup_destroy_locked(cgrp);
4523 4076
4524 for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) 4077 mutex_unlock(&cgroup_mutex);
4525 cft->ss = ss; 4078 mutex_unlock(&cgroup_tree_mutex);
4526 4079
4527 ss->base_cftset.cfts = ss->base_cftypes; 4080 kernfs_unbreak_active_protection(kn);
4528 list_add_tail(&ss->base_cftset.node, &ss->cftsets); 4081 cgroup_put(cgrp);
4529 } 4082 return ret;
4530} 4083}
4531 4084
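
cgroup_destroy_locked() above refuses with -EBUSY while the cgroup still has
attached tasks (non-empty cset_links) or live children, and cgroup_rmdir()
hands that straight back to user space. A small user-space sketch of observing
this, assuming a hierarchy is mounted under /sys/fs/cgroup and the caller is
allowed to manage it; the path below is illustrative only:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	/* adjust to a hierarchy that is actually mounted on your system */
	const char *dir = "/sys/fs/cgroup/freezer/demo";

	if (mkdir(dir, 0755) && errno != EEXIST) {
		perror("mkdir");
		return 1;
	}

	/*
	 * If tasks were attached to "demo" or it had child cgroups,
	 * cgroup_destroy_locked() would bail out and rmdir() would fail
	 * with EBUSY; an empty, childless cgroup is removed successfully.
	 */
	if (rmdir(dir))
		fprintf(stderr, "rmdir: %s\n", strerror(errno));
	else
		puts("cgroup removed");
	return 0;
}
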
4085static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4086 .remount_fs = cgroup_remount,
4087 .show_options = cgroup_show_options,
4088 .mkdir = cgroup_mkdir,
4089 .rmdir = cgroup_rmdir,
4090 .rename = cgroup_rename,
4091};
4092
4532static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4093static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4533{ 4094{
4534 struct cgroup_subsys_state *css; 4095 struct cgroup_subsys_state *css;
4535 4096
4536 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4097 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4537 4098
4099 mutex_lock(&cgroup_tree_mutex);
4538 mutex_lock(&cgroup_mutex); 4100 mutex_lock(&cgroup_mutex);
4539 4101
4540 /* init base cftset */ 4102 INIT_LIST_HEAD(&ss->cfts);
4541 cgroup_init_cftsets(ss);
4542 4103
4543 /* Create the top cgroup state for this subsystem */ 4104 /* Create the root cgroup state for this subsystem */
4544 ss->root = &cgroup_dummy_root; 4105 ss->root = &cgrp_dfl_root;
4545 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); 4106 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4546 /* We don't handle early failures gracefully */ 4107 /* We don't handle early failures gracefully */
4547 BUG_ON(IS_ERR(css)); 4108 BUG_ON(IS_ERR(css));
4548 init_css(css, ss, cgroup_dummy_top); 4109 init_css(css, ss, &cgrp_dfl_root.cgrp);
4549 4110
4550 /* Update the init_css_set to contain a subsys 4111 /* Update the init_css_set to contain a subsys
4551 * pointer to this state - since the subsystem is 4112 * pointer to this state - since the subsystem is
4552 * newly registered, all tasks and hence the 4113 * newly registered, all tasks and hence the
4553 * init_css_set is in the subsystem's top cgroup. */ 4114 * init_css_set is in the subsystem's root cgroup. */
4554 init_css_set.subsys[ss->subsys_id] = css; 4115 init_css_set.subsys[ss->id] = css;
4555 4116
4556 need_forkexit_callback |= ss->fork || ss->exit; 4117 need_forkexit_callback |= ss->fork || ss->exit;
4557 4118
@@ -4562,185 +4123,11 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4562 4123
4563 BUG_ON(online_css(css)); 4124 BUG_ON(online_css(css));
4564 4125
4565 mutex_unlock(&cgroup_mutex); 4126 cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
4566
4567 /* this function shouldn't be used with modular subsystems, since they
4568 * need to register a subsys_id, among other things */
4569 BUG_ON(ss->module);
4570}
4571
4572/**
4573 * cgroup_load_subsys: load and register a modular subsystem at runtime
4574 * @ss: the subsystem to load
4575 *
4576 * This function should be called in a modular subsystem's initcall. If the
4577 * subsystem is built as a module, it will be assigned a new subsys_id and set
4578 * up for use. If the subsystem is built-in anyway, work is delegated to the
4579 * simpler cgroup_init_subsys.
4580 */
4581int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4582{
4583 struct cgroup_subsys_state *css;
4584 int i, ret;
4585 struct hlist_node *tmp;
4586 struct css_set *cset;
4587 unsigned long key;
4588
4589 /* check name and function validity */
4590 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4591 ss->css_alloc == NULL || ss->css_free == NULL)
4592 return -EINVAL;
4593
4594 /*
4595 * we don't support callbacks in modular subsystems. this check is
4596 * before the ss->module check for consistency; a subsystem that could
4597 * be a module should still have no callbacks even if the user isn't
4598 * compiling it as one.
4599 */
4600 if (ss->fork || ss->exit)
4601 return -EINVAL;
4602
4603 /*
4604 * an optionally modular subsystem is built-in: we want to do nothing,
4605 * since cgroup_init_subsys will have already taken care of it.
4606 */
4607 if (ss->module == NULL) {
4608 /* a sanity check */
4609 BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
4610 return 0;
4611 }
4612
4613 /* init base cftset */
4614 cgroup_init_cftsets(ss);
4615
4616 mutex_lock(&cgroup_mutex);
4617 mutex_lock(&cgroup_root_mutex);
4618 cgroup_subsys[ss->subsys_id] = ss;
4619
4620 /*
4621 * no ss->css_alloc seems to need anything important in the ss
4622 * struct, so this can happen first (i.e. before the dummy root
4623 * attachment).
4624 */
4625 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4626 if (IS_ERR(css)) {
4627 /* failure case - need to deassign the cgroup_subsys[] slot. */
4628 cgroup_subsys[ss->subsys_id] = NULL;
4629 mutex_unlock(&cgroup_root_mutex);
4630 mutex_unlock(&cgroup_mutex);
4631 return PTR_ERR(css);
4632 }
4633
4634 ss->root = &cgroup_dummy_root;
4635
4636 /* our new subsystem will be attached to the dummy hierarchy. */
4637 init_css(css, ss, cgroup_dummy_top);
4638
4639 /*
4640 * Now we need to entangle the css into the existing css_sets. unlike
4641 * in cgroup_init_subsys, there are now multiple css_sets, so each one
4642 * will need a new pointer to it; done by iterating the css_set_table.
4643 * furthermore, modifying the existing css_sets will corrupt the hash
4644 * table state, so each changed css_set will need its hash recomputed.
4645 * this is all done under the css_set_lock.
4646 */
4647 write_lock(&css_set_lock);
4648 hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
4649 /* skip entries that we already rehashed */
4650 if (cset->subsys[ss->subsys_id])
4651 continue;
4652 /* remove existing entry */
4653 hash_del(&cset->hlist);
4654 /* set new value */
4655 cset->subsys[ss->subsys_id] = css;
4656 /* recompute hash and restore entry */
4657 key = css_set_hash(cset->subsys);
4658 hash_add(css_set_table, &cset->hlist, key);
4659 }
4660 write_unlock(&css_set_lock);
4661
4662 ret = online_css(css);
4663 if (ret) {
4664 ss->css_free(css);
4665 goto err_unload;
4666 }
4667
4668 /* success! */
4669 mutex_unlock(&cgroup_root_mutex);
4670 mutex_unlock(&cgroup_mutex);
4671 return 0;
4672
4673err_unload:
4674 mutex_unlock(&cgroup_root_mutex);
4675 mutex_unlock(&cgroup_mutex);
4676 /* @ss can't be mounted here as try_module_get() would fail */
4677 cgroup_unload_subsys(ss);
4678 return ret;
4679}
4680EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4681
4682/**
4683 * cgroup_unload_subsys: unload a modular subsystem
4684 * @ss: the subsystem to unload
4685 *
4686 * This function should be called in a modular subsystem's exitcall. When this
4687 * function is invoked, the refcount on the subsystem's module will be 0, so
4688 * the subsystem will not be attached to any hierarchy.
4689 */
4690void cgroup_unload_subsys(struct cgroup_subsys *ss)
4691{
4692 struct cgrp_cset_link *link;
4693 struct cgroup_subsys_state *css;
4694
4695 BUG_ON(ss->module == NULL);
4696
4697 /*
4698 * we shouldn't be called if the subsystem is in use, and the use of
4699 * try_module_get() in rebind_subsystems() should ensure that it
4700 * doesn't start being used while we're killing it off.
4701 */
4702 BUG_ON(ss->root != &cgroup_dummy_root);
4703
4704 mutex_lock(&cgroup_mutex);
4705 mutex_lock(&cgroup_root_mutex);
4706
4707 css = cgroup_css(cgroup_dummy_top, ss);
4708 if (css)
4709 offline_css(css);
4710 4127
4711 /* deassign the subsys_id */
4712 cgroup_subsys[ss->subsys_id] = NULL;
4713
4714 /*
4715 * disentangle the css from all css_sets attached to the dummy
4716 * top. as in loading, we need to pay our respects to the hashtable
4717 * gods.
4718 */
4719 write_lock(&css_set_lock);
4720 list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
4721 struct css_set *cset = link->cset;
4722 unsigned long key;
4723
4724 hash_del(&cset->hlist);
4725 cset->subsys[ss->subsys_id] = NULL;
4726 key = css_set_hash(cset->subsys);
4727 hash_add(css_set_table, &cset->hlist, key);
4728 }
4729 write_unlock(&css_set_lock);
4730
4731 /*
4732 * remove subsystem's css from the cgroup_dummy_top and free it -
4733 * need to free before marking as null because ss->css_free needs
4734 * the cgrp->subsys pointer to find their state.
4735 */
4736 if (css)
4737 ss->css_free(css);
4738 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4739
4740 mutex_unlock(&cgroup_root_mutex);
4741 mutex_unlock(&cgroup_mutex); 4128 mutex_unlock(&cgroup_mutex);
4129 mutex_unlock(&cgroup_tree_mutex);
4742} 4130}
4743EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4744 4131
4745/** 4132/**
4746 * cgroup_init_early - cgroup initialization at system boot 4133 * cgroup_init_early - cgroup initialization at system boot
@@ -4750,34 +4137,24 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4750 */ 4137 */
4751int __init cgroup_init_early(void) 4138int __init cgroup_init_early(void)
4752{ 4139{
4140 static struct cgroup_sb_opts __initdata opts =
4141 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4753 struct cgroup_subsys *ss; 4142 struct cgroup_subsys *ss;
4754 int i; 4143 int i;
4755 4144
4756 atomic_set(&init_css_set.refcount, 1); 4145 init_cgroup_root(&cgrp_dfl_root, &opts);
4757 INIT_LIST_HEAD(&init_css_set.cgrp_links);
4758 INIT_LIST_HEAD(&init_css_set.tasks);
4759 INIT_HLIST_NODE(&init_css_set.hlist);
4760 css_set_count = 1;
4761 init_cgroup_root(&cgroup_dummy_root);
4762 cgroup_root_count = 1;
4763 RCU_INIT_POINTER(init_task.cgroups, &init_css_set); 4146 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4764 4147
4765 init_cgrp_cset_link.cset = &init_css_set; 4148 for_each_subsys(ss, i) {
4766 init_cgrp_cset_link.cgrp = cgroup_dummy_top; 4149 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4767 list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); 4150 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
4768 list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); 4151 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4769 4152 ss->id, ss->name);
4770 /* at bootup time, we don't worry about modular subsystems */ 4153 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4771 for_each_builtin_subsys(ss, i) { 4154 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4772 BUG_ON(!ss->name); 4155
4773 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4156 ss->id = i;
4774 BUG_ON(!ss->css_alloc); 4157 ss->name = cgroup_subsys_name[i];
4775 BUG_ON(!ss->css_free);
4776 if (ss->subsys_id != i) {
4777 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4778 ss->name, ss->subsys_id);
4779 BUG();
4780 }
4781 4158
4782 if (ss->early_init) 4159 if (ss->early_init)
4783 cgroup_init_subsys(ss); 4160 cgroup_init_subsys(ss);
@@ -4795,53 +4172,46 @@ int __init cgroup_init(void)
4795{ 4172{
4796 struct cgroup_subsys *ss; 4173 struct cgroup_subsys *ss;
4797 unsigned long key; 4174 unsigned long key;
4798 int i, err; 4175 int ssid, err;
4799 4176
4800 err = bdi_init(&cgroup_backing_dev_info); 4177 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4801 if (err)
4802 return err;
4803 4178
4804 for_each_builtin_subsys(ss, i) { 4179 mutex_lock(&cgroup_tree_mutex);
4805 if (!ss->early_init)
4806 cgroup_init_subsys(ss);
4807 }
4808
4809 /* allocate id for the dummy hierarchy */
4810 mutex_lock(&cgroup_mutex); 4180 mutex_lock(&cgroup_mutex);
4811 mutex_lock(&cgroup_root_mutex);
4812 4181
4813 /* Add init_css_set to the hash table */ 4182 /* Add init_css_set to the hash table */
4814 key = css_set_hash(init_css_set.subsys); 4183 key = css_set_hash(init_css_set.subsys);
4815 hash_add(css_set_table, &init_css_set.hlist, key); 4184 hash_add(css_set_table, &init_css_set.hlist, key);
4816 4185
4817 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); 4186 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4818 4187
4819 err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
4820 0, 1, GFP_KERNEL);
4821 BUG_ON(err < 0);
4822
4823 mutex_unlock(&cgroup_root_mutex);
4824 mutex_unlock(&cgroup_mutex); 4188 mutex_unlock(&cgroup_mutex);
4189 mutex_unlock(&cgroup_tree_mutex);
4825 4190
4826 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4191 for_each_subsys(ss, ssid) {
4827 if (!cgroup_kobj) { 4192 if (!ss->early_init)
4828 err = -ENOMEM; 4193 cgroup_init_subsys(ss);
4829 goto out; 4194
4195 /*
4196 * cftype registration needs kmalloc and can't be done
4197 * during early_init. Register base cftypes separately.
4198 */
4199 if (ss->base_cftypes)
4200 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4830 } 4201 }
4831 4202
4203 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4204 if (!cgroup_kobj)
4205 return -ENOMEM;
4206
4832 err = register_filesystem(&cgroup_fs_type); 4207 err = register_filesystem(&cgroup_fs_type);
4833 if (err < 0) { 4208 if (err < 0) {
4834 kobject_put(cgroup_kobj); 4209 kobject_put(cgroup_kobj);
4835 goto out; 4210 return err;
4836 } 4211 }
4837 4212
4838 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 4213 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4839 4214 return 0;
4840out:
4841 if (err)
4842 bdi_destroy(&cgroup_backing_dev_info);
4843
4844 return err;
4845} 4215}
4846 4216
4847static int __init cgroup_wq_init(void) 4217static int __init cgroup_wq_init(void)
@@ -4873,12 +4243,6 @@ core_initcall(cgroup_wq_init);
4873 * proc_cgroup_show() 4243 * proc_cgroup_show()
4874 * - Print task's cgroup paths into seq_file, one line for each hierarchy 4244 * - Print task's cgroup paths into seq_file, one line for each hierarchy
4875 * - Used for /proc/<pid>/cgroup. 4245 * - Used for /proc/<pid>/cgroup.
4876 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
4877 * doesn't really matter if tsk->cgroup changes after we read it,
4878 * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
4879 * anyway. No need to check that tsk->cgroup != NULL, thanks to
4880 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
4881 * cgroup to top_cgroup.
4882 */ 4246 */
4883 4247
4884/* TODO: Use a proper seq_file iterator */ 4248/* TODO: Use a proper seq_file iterator */
@@ -4886,12 +4250,12 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4886{ 4250{
4887 struct pid *pid; 4251 struct pid *pid;
4888 struct task_struct *tsk; 4252 struct task_struct *tsk;
4889 char *buf; 4253 char *buf, *path;
4890 int retval; 4254 int retval;
4891 struct cgroupfs_root *root; 4255 struct cgroup_root *root;
4892 4256
4893 retval = -ENOMEM; 4257 retval = -ENOMEM;
4894 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4258 buf = kmalloc(PATH_MAX, GFP_KERNEL);
4895 if (!buf) 4259 if (!buf)
4896 goto out; 4260 goto out;
4897 4261
@@ -4904,29 +4268,36 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4904 retval = 0; 4268 retval = 0;
4905 4269
4906 mutex_lock(&cgroup_mutex); 4270 mutex_lock(&cgroup_mutex);
4271 down_read(&css_set_rwsem);
4907 4272
4908 for_each_active_root(root) { 4273 for_each_root(root) {
4909 struct cgroup_subsys *ss; 4274 struct cgroup_subsys *ss;
4910 struct cgroup *cgrp; 4275 struct cgroup *cgrp;
4911 int ssid, count = 0; 4276 int ssid, count = 0;
4912 4277
4278 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
4279 continue;
4280
4913 seq_printf(m, "%d:", root->hierarchy_id); 4281 seq_printf(m, "%d:", root->hierarchy_id);
4914 for_each_subsys(ss, ssid) 4282 for_each_subsys(ss, ssid)
4915 if (root->subsys_mask & (1 << ssid)) 4283 if (root->cgrp.subsys_mask & (1 << ssid))
4916 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4284 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4917 if (strlen(root->name)) 4285 if (strlen(root->name))
4918 seq_printf(m, "%sname=%s", count ? "," : "", 4286 seq_printf(m, "%sname=%s", count ? "," : "",
4919 root->name); 4287 root->name);
4920 seq_putc(m, ':'); 4288 seq_putc(m, ':');
4921 cgrp = task_cgroup_from_root(tsk, root); 4289 cgrp = task_cgroup_from_root(tsk, root);
4922 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 4290 path = cgroup_path(cgrp, buf, PATH_MAX);
4923 if (retval < 0) 4291 if (!path) {
4292 retval = -ENAMETOOLONG;
4924 goto out_unlock; 4293 goto out_unlock;
4925 seq_puts(m, buf); 4294 }
4295 seq_puts(m, path);
4926 seq_putc(m, '\n'); 4296 seq_putc(m, '\n');
4927 } 4297 }
4928 4298
4929out_unlock: 4299out_unlock:
4300 up_read(&css_set_rwsem);
4930 mutex_unlock(&cgroup_mutex); 4301 mutex_unlock(&cgroup_mutex);
4931 put_task_struct(tsk); 4302 put_task_struct(tsk);
4932out_free: 4303out_free:
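
Each record written by proc_cgroup_show() above has the form
hierarchy-id:controller-list:path, where the controller list is comma
separated and named hierarchies additionally carry a name=... entry. A
user-space sketch that splits one such line; the parsing code is illustrative,
not part of the kernel:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[] = "4:cpu,cpuacct:/user.slice";	/* sample /proc/self/cgroup record */
	char *id = line;
	char *controllers = strchr(id, ':');
	char *path;

	if (!controllers)
		return 1;
	*controllers++ = '\0';

	path = strchr(controllers, ':');
	if (!path)
		return 1;
	*path++ = '\0';

	printf("hierarchy %s, controllers \"%s\", cgroup path %s\n",
	       id, controllers, path);
	return 0;
}
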
@@ -4952,7 +4323,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
4952 for_each_subsys(ss, i) 4323 for_each_subsys(ss, i)
4953 seq_printf(m, "%s\t%d\t%d\t%d\n", 4324 seq_printf(m, "%s\t%d\t%d\t%d\n",
4954 ss->name, ss->root->hierarchy_id, 4325 ss->name, ss->root->hierarchy_id,
4955 ss->root->number_of_cgroups, !ss->disabled); 4326 atomic_read(&ss->root->nr_cgrps), !ss->disabled);
4956 4327
4957 mutex_unlock(&cgroup_mutex); 4328 mutex_unlock(&cgroup_mutex);
4958 return 0; 4329 return 0;
@@ -4971,27 +4342,16 @@ static const struct file_operations proc_cgroupstats_operations = {
4971}; 4342};
4972 4343
4973/** 4344/**
4974 * cgroup_fork - attach newly forked task to its parents cgroup. 4345 * cgroup_fork - initialize cgroup related fields during copy_process()
4975 * @child: pointer to task_struct of forking parent process. 4346 * @child: pointer to task_struct of forking parent process.
4976 * 4347 *
4977 * Description: A task inherits its parent's cgroup at fork(). 4348 * A task is associated with the init_css_set until cgroup_post_fork()
4978 * 4349 * attaches it to the parent's css_set. Empty cg_list indicates that
4979 * A pointer to the shared css_set was automatically copied in 4350 * @child isn't holding reference to its css_set.
4980 * fork.c by dup_task_struct(). However, we ignore that copy, since
4981 * it was not made under the protection of RCU or cgroup_mutex, so
4982 * might no longer be a valid cgroup pointer. cgroup_attach_task() might
4983 * have already changed current->cgroups, allowing the previously
4984 * referenced cgroup group to be removed and freed.
4985 *
4986 * At the point that cgroup_fork() is called, 'current' is the parent
4987 * task, and the passed argument 'child' points to the child task.
4988 */ 4351 */
4989void cgroup_fork(struct task_struct *child) 4352void cgroup_fork(struct task_struct *child)
4990{ 4353{
4991 task_lock(current); 4354 RCU_INIT_POINTER(child->cgroups, &init_css_set);
4992 get_css_set(task_css_set(current));
4993 child->cgroups = current->cgroups;
4994 task_unlock(current);
4995 INIT_LIST_HEAD(&child->cg_list); 4355 INIT_LIST_HEAD(&child->cg_list);
4996} 4356}
4997 4357
@@ -5011,23 +4371,37 @@ void cgroup_post_fork(struct task_struct *child)
5011 int i; 4371 int i;
5012 4372
5013 /* 4373 /*
5014	 * use_task_css_set_links is set to 1 before we walk the tasklist 4374	 * This may race against cgroup_enable_task_cg_lists(). As that
5015 * under the tasklist_lock and we read it here after we added the child 4375 * function sets use_task_css_set_links before grabbing
5016 * to the tasklist under the tasklist_lock as well. If the child wasn't 4376 * tasklist_lock and we just went through tasklist_lock to add
5017 * yet in the tasklist when we walked through it from 4377 * @child, it's guaranteed that either we see the set
5018 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value 4378 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
5019 * should be visible now due to the paired locking and barriers implied 4379 * @child during its iteration.
5020 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock 4380 *
5021 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock 4381 * If we won the race, @child is associated with %current's
5022 * lock on fork. 4382 * css_set. Grabbing css_set_rwsem guarantees both that the
4383 * association is stable, and, on completion of the parent's
4384 * migration, @child is visible in the source of migration or
4385 * already in the destination cgroup. This guarantee is necessary
4386 * when implementing operations which need to migrate all tasks of
4387 * a cgroup to another.
4388 *
4389	 * Note that if we lose to cgroup_enable_task_cg_lists(), @child
4390 * will remain in init_css_set. This is safe because all tasks are
4391 * in the init_css_set before cg_links is enabled and there's no
4392 * operation which transfers all tasks out of init_css_set.
5023 */ 4393 */
5024 if (use_task_css_set_links) { 4394 if (use_task_css_set_links) {
5025 write_lock(&css_set_lock); 4395 struct css_set *cset;
5026 task_lock(child); 4396
5027 if (list_empty(&child->cg_list)) 4397 down_write(&css_set_rwsem);
5028 list_add(&child->cg_list, &task_css_set(child)->tasks); 4398 cset = task_css_set(current);
5029 task_unlock(child); 4399 if (list_empty(&child->cg_list)) {
5030 write_unlock(&css_set_lock); 4400 rcu_assign_pointer(child->cgroups, cset);
4401 list_add(&child->cg_list, &cset->tasks);
4402 get_css_set(cset);
4403 }
4404 up_write(&css_set_rwsem);
5031 } 4405 }
5032 4406
5033 /* 4407 /*
@@ -5036,15 +4410,7 @@ void cgroup_post_fork(struct task_struct *child)
5036 * and addition to css_set. 4410 * and addition to css_set.
5037 */ 4411 */
5038 if (need_forkexit_callback) { 4412 if (need_forkexit_callback) {
5039 /* 4413 for_each_subsys(ss, i)
5040 * fork/exit callbacks are supported only for builtin
5041 * subsystems, and the builtin section of the subsys
5042 * array is immutable, so we don't need to lock the
5043 * subsys array here. On the other hand, modular section
5044 * of the array can be freed at module unload, so we
5045 * can't touch that.
5046 */
5047 for_each_builtin_subsys(ss, i)
5048 if (ss->fork) 4414 if (ss->fork)
5049 ss->fork(child); 4415 ss->fork(child);
5050 } 4416 }
@@ -5053,7 +4419,6 @@ void cgroup_post_fork(struct task_struct *child)
5053/** 4419/**
5054 * cgroup_exit - detach cgroup from exiting task 4420 * cgroup_exit - detach cgroup from exiting task
5055 * @tsk: pointer to task_struct of exiting process 4421 * @tsk: pointer to task_struct of exiting process
5056 * @run_callback: run exit callbacks?
5057 * 4422 *
5058 * Description: Detach cgroup from @tsk and release it. 4423 * Description: Detach cgroup from @tsk and release it.
5059 * 4424 *
@@ -5063,57 +4428,38 @@ void cgroup_post_fork(struct task_struct *child)
5063 * use notify_on_release cgroups where very high task exit scaling 4428 * use notify_on_release cgroups where very high task exit scaling
5064 * is required on large systems. 4429 * is required on large systems.
5065 * 4430 *
5066 * the_top_cgroup_hack: 4431 * We set the exiting task's cgroup to the root cgroup (top_cgroup). We
5067 * 4432 * call cgroup_exit() while the task is still competent to handle
5068 * Set the exiting tasks cgroup to the root cgroup (top_cgroup). 4433 * notify_on_release(), then leave the task attached to the root cgroup in
5069 * 4434 * each hierarchy for the remainder of its exit. No need to bother with
5070 * We call cgroup_exit() while the task is still competent to 4435 * init_css_set refcnting. init_css_set never goes away and we can't race
5071 * handle notify_on_release(), then leave the task attached to the 4436 * with migration path - PF_EXITING is visible to migration path.
5072 * root cgroup in each hierarchy for the remainder of its exit.
5073 *
5074 * To do this properly, we would increment the reference count on
5075 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
5076 * code we would add a second cgroup function call, to drop that
5077 * reference. This would just create an unnecessary hot spot on
5078 * the top_cgroup reference count, to no avail.
5079 *
5080 * Normally, holding a reference to a cgroup without bumping its
5081 * count is unsafe. The cgroup could go away, or someone could
5082 * attach us to a different cgroup, decrementing the count on
5083 * the first cgroup that we never incremented. But in this case,
5084 * top_cgroup isn't going away, and either task has PF_EXITING set,
5085 * which wards off any cgroup_attach_task() attempts, or task is a failed
5086 * fork, never visible to cgroup_attach_task.
5087 */ 4437 */
5088void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4438void cgroup_exit(struct task_struct *tsk)
5089{ 4439{
5090 struct cgroup_subsys *ss; 4440 struct cgroup_subsys *ss;
5091 struct css_set *cset; 4441 struct css_set *cset;
4442 bool put_cset = false;
5092 int i; 4443 int i;
5093 4444
5094 /* 4445 /*
5095	 * Unlink from the css_set task list if necessary. 4446	 * Unlink @tsk from its css_set. As migration path can't race
5096 * Optimistically check cg_list before taking 4447 * with us, we can check cg_list without grabbing css_set_rwsem.
5097 * css_set_lock
5098 */ 4448 */
5099 if (!list_empty(&tsk->cg_list)) { 4449 if (!list_empty(&tsk->cg_list)) {
5100 write_lock(&css_set_lock); 4450 down_write(&css_set_rwsem);
5101 if (!list_empty(&tsk->cg_list)) 4451 list_del_init(&tsk->cg_list);
5102 list_del_init(&tsk->cg_list); 4452 up_write(&css_set_rwsem);
5103 write_unlock(&css_set_lock); 4453 put_cset = true;
5104 } 4454 }
5105 4455
5106 /* Reassign the task to the init_css_set. */ 4456 /* Reassign the task to the init_css_set. */
5107 task_lock(tsk);
5108 cset = task_css_set(tsk); 4457 cset = task_css_set(tsk);
5109 RCU_INIT_POINTER(tsk->cgroups, &init_css_set); 4458 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
5110 4459
5111 if (run_callbacks && need_forkexit_callback) { 4460 if (need_forkexit_callback) {
5112 /* 4461 /* see cgroup_post_fork() for details */
5113 * fork/exit callbacks are supported only for builtin 4462 for_each_subsys(ss, i) {
5114 * subsystems, see cgroup_post_fork() for details.
5115 */
5116 for_each_builtin_subsys(ss, i) {
5117 if (ss->exit) { 4463 if (ss->exit) {
5118 struct cgroup_subsys_state *old_css = cset->subsys[i]; 4464 struct cgroup_subsys_state *old_css = cset->subsys[i];
5119 struct cgroup_subsys_state *css = task_css(tsk, i); 4465 struct cgroup_subsys_state *css = task_css(tsk, i);
@@ -5122,9 +4468,9 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5122 } 4468 }
5123 } 4469 }
5124 } 4470 }
5125 task_unlock(tsk);
5126 4471
5127 put_css_set_taskexit(cset); 4472 if (put_cset)
4473 put_css_set(cset, true);
5128} 4474}
5129 4475
5130static void check_for_release(struct cgroup *cgrp) 4476static void check_for_release(struct cgroup *cgrp)
@@ -5181,16 +4527,17 @@ static void cgroup_release_agent(struct work_struct *work)
5181 while (!list_empty(&release_list)) { 4527 while (!list_empty(&release_list)) {
5182 char *argv[3], *envp[3]; 4528 char *argv[3], *envp[3];
5183 int i; 4529 int i;
5184 char *pathbuf = NULL, *agentbuf = NULL; 4530 char *pathbuf = NULL, *agentbuf = NULL, *path;
5185 struct cgroup *cgrp = list_entry(release_list.next, 4531 struct cgroup *cgrp = list_entry(release_list.next,
5186 struct cgroup, 4532 struct cgroup,
5187 release_list); 4533 release_list);
5188 list_del_init(&cgrp->release_list); 4534 list_del_init(&cgrp->release_list);
5189 raw_spin_unlock(&release_list_lock); 4535 raw_spin_unlock(&release_list_lock);
5190 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4536 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
5191 if (!pathbuf) 4537 if (!pathbuf)
5192 goto continue_free; 4538 goto continue_free;
5193 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) 4539 path = cgroup_path(cgrp, pathbuf, PATH_MAX);
4540 if (!path)
5194 goto continue_free; 4541 goto continue_free;
5195 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); 4542 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5196 if (!agentbuf) 4543 if (!agentbuf)
@@ -5198,7 +4545,7 @@ static void cgroup_release_agent(struct work_struct *work)
5198 4545
5199 i = 0; 4546 i = 0;
5200 argv[i++] = agentbuf; 4547 argv[i++] = agentbuf;
5201 argv[i++] = pathbuf; 4548 argv[i++] = path;
5202 argv[i] = NULL; 4549 argv[i] = NULL;
5203 4550
5204 i = 0; 4551 i = 0;
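
cgroup_release_agent() above execs root->release_agent_path with the path of
the now-empty cgroup as argv[1] (the minimal environment is set up in the part
of the function that falls outside this hunk). A sketch of what such an agent
might do with its argument; the log file location is made up for illustration:

#include <stdio.h>

int main(int argc, char **argv)
{
	FILE *log;

	if (argc < 2)
		return 1;

	/* argv[1] is the cgroup path handed over by the kernel */
	log = fopen("/tmp/release-agent.log", "a");	/* illustrative location */
	if (!log)
		return 1;
	fprintf(log, "released: %s\n", argv[1]);
	fclose(log);
	return 0;
}
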
@@ -5232,11 +4579,7 @@ static int __init cgroup_disable(char *str)
5232 if (!*token) 4579 if (!*token)
5233 continue; 4580 continue;
5234 4581
5235 /* 4582 for_each_subsys(ss, i) {
5236 * cgroup_disable, being at boot time, can't know about
5237 * module subsystems, so we don't worry about them.
5238 */
5239 for_each_builtin_subsys(ss, i) {
5240 if (!strcmp(token, ss->name)) { 4583 if (!strcmp(token, ss->name)) {
5241 ss->disabled = 1; 4584 ss->disabled = 1;
5242 printk(KERN_INFO "Disabling %s control group" 4585 printk(KERN_INFO "Disabling %s control group"
@@ -5250,28 +4593,42 @@ static int __init cgroup_disable(char *str)
5250__setup("cgroup_disable=", cgroup_disable); 4593__setup("cgroup_disable=", cgroup_disable);
5251 4594
5252/** 4595/**
5253 * css_from_dir - get corresponding css from the dentry of a cgroup dir 4596 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
5254 * @dentry: directory dentry of interest 4597 * @dentry: directory dentry of interest
5255 * @ss: subsystem of interest 4598 * @ss: subsystem of interest
5256 * 4599 *
5257 * Must be called under cgroup_mutex or RCU read lock. The caller is 4600 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
5258 * responsible for pinning the returned css if it needs to be accessed 4601 * to get the corresponding css and return it. If such css doesn't exist
5259 * outside the critical section. 4602 * or can't be pinned, an ERR_PTR value is returned.
5260 */ 4603 */
5261struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, 4604struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
5262 struct cgroup_subsys *ss) 4605 struct cgroup_subsys *ss)
5263{ 4606{
4607 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4608 struct cgroup_subsys_state *css = NULL;
5264 struct cgroup *cgrp; 4609 struct cgroup *cgrp;
5265 4610
5266 cgroup_assert_mutex_or_rcu_locked();
5267
5268 /* is @dentry a cgroup dir? */ 4611 /* is @dentry a cgroup dir? */
5269 if (!dentry->d_inode || 4612 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
5270 dentry->d_inode->i_op != &cgroup_dir_inode_operations) 4613 kernfs_type(kn) != KERNFS_DIR)
5271 return ERR_PTR(-EBADF); 4614 return ERR_PTR(-EBADF);
5272 4615
5273 cgrp = __d_cgrp(dentry); 4616 rcu_read_lock();
5274 return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); 4617
4618 /*
4619 * This path doesn't originate from kernfs and @kn could already
4620 * have been or be removed at any point. @kn->priv is RCU
4621 * protected for this access. See destroy_locked() for details.
4622 */
4623 cgrp = rcu_dereference(kn->priv);
4624 if (cgrp)
4625 css = cgroup_css(cgrp, ss);
4626
4627 if (!css || !css_tryget(css))
4628 css = ERR_PTR(-ENOENT);
4629
4630 rcu_read_unlock();
4631 return css;
5275} 4632}
5276 4633
5277/** 4634/**
@@ -5286,7 +4643,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5286{ 4643{
5287 struct cgroup *cgrp; 4644 struct cgroup *cgrp;
5288 4645
5289 cgroup_assert_mutex_or_rcu_locked(); 4646 cgroup_assert_mutexes_or_rcu_locked();
5290 4647
5291 cgrp = idr_find(&ss->root->cgroup_idr, id); 4648 cgrp = idr_find(&ss->root->cgroup_idr, id);
5292 if (cgrp) 4649 if (cgrp)
@@ -5338,23 +4695,25 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5338{ 4695{
5339 struct cgrp_cset_link *link; 4696 struct cgrp_cset_link *link;
5340 struct css_set *cset; 4697 struct css_set *cset;
4698 char *name_buf;
5341 4699
5342 read_lock(&css_set_lock); 4700 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
4701 if (!name_buf)
4702 return -ENOMEM;
4703
4704 down_read(&css_set_rwsem);
5343 rcu_read_lock(); 4705 rcu_read_lock();
5344 cset = rcu_dereference(current->cgroups); 4706 cset = rcu_dereference(current->cgroups);
5345 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 4707 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5346 struct cgroup *c = link->cgrp; 4708 struct cgroup *c = link->cgrp;
5347 const char *name;
5348 4709
5349 if (c->dentry) 4710 cgroup_name(c, name_buf, NAME_MAX + 1);
5350 name = c->dentry->d_name.name;
5351 else
5352 name = "?";
5353 seq_printf(seq, "Root %d group %s\n", 4711 seq_printf(seq, "Root %d group %s\n",
5354 c->root->hierarchy_id, name); 4712 c->root->hierarchy_id, name_buf);
5355 } 4713 }
5356 rcu_read_unlock(); 4714 rcu_read_unlock();
5357 read_unlock(&css_set_lock); 4715 up_read(&css_set_rwsem);
4716 kfree(name_buf);
5358 return 0; 4717 return 0;
5359} 4718}
5360 4719
@@ -5364,23 +4723,30 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
5364 struct cgroup_subsys_state *css = seq_css(seq); 4723 struct cgroup_subsys_state *css = seq_css(seq);
5365 struct cgrp_cset_link *link; 4724 struct cgrp_cset_link *link;
5366 4725
5367 read_lock(&css_set_lock); 4726 down_read(&css_set_rwsem);
5368 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { 4727 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5369 struct css_set *cset = link->cset; 4728 struct css_set *cset = link->cset;
5370 struct task_struct *task; 4729 struct task_struct *task;
5371 int count = 0; 4730 int count = 0;
4731
5372 seq_printf(seq, "css_set %p\n", cset); 4732 seq_printf(seq, "css_set %p\n", cset);
4733
5373 list_for_each_entry(task, &cset->tasks, cg_list) { 4734 list_for_each_entry(task, &cset->tasks, cg_list) {
5374 if (count++ > MAX_TASKS_SHOWN_PER_CSS) { 4735 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
5375 seq_puts(seq, " ...\n"); 4736 goto overflow;
5376 break; 4737 seq_printf(seq, " task %d\n", task_pid_vnr(task));
5377 } else { 4738 }
5378 seq_printf(seq, " task %d\n", 4739
5379 task_pid_vnr(task)); 4740 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
5380 } 4741 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
4742 goto overflow;
4743 seq_printf(seq, " task %d\n", task_pid_vnr(task));
5381 } 4744 }
4745 continue;
4746 overflow:
4747 seq_puts(seq, " ...\n");
5382 } 4748 }
5383 read_unlock(&css_set_lock); 4749 up_read(&css_set_rwsem);
5384 return 0; 4750 return 0;
5385} 4751}
5386 4752
@@ -5423,11 +4789,9 @@ static struct cftype debug_files[] = {
5423 { } /* terminate */ 4789 { } /* terminate */
5424}; 4790};
5425 4791
5426struct cgroup_subsys debug_subsys = { 4792struct cgroup_subsys debug_cgrp_subsys = {
5427 .name = "debug",
5428 .css_alloc = debug_css_alloc, 4793 .css_alloc = debug_css_alloc,
5429 .css_free = debug_css_free, 4794 .css_free = debug_css_free,
5430 .subsys_id = debug_subsys_id,
5431 .base_cftypes = debug_files, 4795 .base_cftypes = debug_files,
5432}; 4796};
5433#endif /* CONFIG_CGROUP_DEBUG */ 4797#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 6c3154e477f6..2bc4a2256444 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -52,7 +52,7 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
52 52
53static inline struct freezer *task_freezer(struct task_struct *task) 53static inline struct freezer *task_freezer(struct task_struct *task)
54{ 54{
55 return css_freezer(task_css(task, freezer_subsys_id)); 55 return css_freezer(task_css(task, freezer_cgrp_id));
56} 56}
57 57
58static struct freezer *parent_freezer(struct freezer *freezer) 58static struct freezer *parent_freezer(struct freezer *freezer)
@@ -84,8 +84,6 @@ static const char *freezer_state_strs(unsigned int state)
84 return "THAWED"; 84 return "THAWED";
85}; 85};
86 86
87struct cgroup_subsys freezer_subsys;
88
89static struct cgroup_subsys_state * 87static struct cgroup_subsys_state *
90freezer_css_alloc(struct cgroup_subsys_state *parent_css) 88freezer_css_alloc(struct cgroup_subsys_state *parent_css)
91{ 89{
@@ -189,7 +187,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
189 * current state before executing the following - !frozen tasks may 187 * current state before executing the following - !frozen tasks may
190 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. 188 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
191 */ 189 */
192 cgroup_taskset_for_each(task, new_css, tset) { 190 cgroup_taskset_for_each(task, tset) {
193 if (!(freezer->state & CGROUP_FREEZING)) { 191 if (!(freezer->state & CGROUP_FREEZING)) {
194 __thaw_task(task); 192 __thaw_task(task);
195 } else { 193 } else {
@@ -216,6 +214,16 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
216 } 214 }
217} 215}
218 216
217/**
218 * freezer_fork - cgroup post fork callback
219 * @task: a task which has just been forked
220 *
221 * @task has just been created and should conform to the current state of
222 * the cgroup_freezer it belongs to. This function may race against
223 * freezer_attach(). Losing to freezer_attach() means that we don't have
224 * to do anything as freezer_attach() will put @task into the appropriate
225 * state.
226 */
219static void freezer_fork(struct task_struct *task) 227static void freezer_fork(struct task_struct *task)
220{ 228{
221 struct freezer *freezer; 229 struct freezer *freezer;
@@ -224,14 +232,26 @@ static void freezer_fork(struct task_struct *task)
224 freezer = task_freezer(task); 232 freezer = task_freezer(task);
225 233
226 /* 234 /*
227 * The root cgroup is non-freezable, so we can skip the 235 * The root cgroup is non-freezable, so we can skip locking the
228 * following check. 236 * freezer. This is safe regardless of race with task migration.
237 * If we didn't race or won, skipping is obviously the right thing
238 * to do. If we lost and root is the new cgroup, noop is still the
239 * right thing to do.
229 */ 240 */
230 if (!parent_freezer(freezer)) 241 if (!parent_freezer(freezer))
231 goto out; 242 goto out;
232 243
244 /*
245 * Grab @freezer->lock and freeze @task after verifying @task still
246 * belongs to @freezer and it's freezing. The former is for the
247 * case where we have raced against task migration and lost and
248 * @task is already in a different cgroup which may not be frozen.
249 * This isn't strictly necessary as freeze_task() is allowed to be
250 * called spuriously but let's do it anyway for, if nothing else,
251 * documentation.
252 */
233 spin_lock_irq(&freezer->lock); 253 spin_lock_irq(&freezer->lock);
234 if (freezer->state & CGROUP_FREEZING) 254 if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
235 freeze_task(task); 255 freeze_task(task);
236 spin_unlock_irq(&freezer->lock); 256 spin_unlock_irq(&freezer->lock);
237out: 257out:
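
The reworked freezer_fork() above re-checks freezer == task_freezer(task)
after taking freezer->lock, so a fork that raced with migration never freezes
@task on behalf of a group it has already left. A user-space sketch of the
same look-up-then-re-verify-under-lock idiom using pthreads; in the kernel the
unlocked read is additionally covered by RCU, which this toy version omits:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct freeze_group {
	pthread_mutex_t lock;
	bool freezing;
};

struct task {
	struct freeze_group *grp;	/* may be changed by a "migration" thread */
};

/*
 * Read the task's group without the lock, then take that group's lock and
 * re-verify membership before acting, so a racing migration cannot make us
 * act against the wrong group.
 */
static void maybe_freeze(struct task *t)
{
	struct freeze_group *grp = t->grp;	/* optimistic, unlocked read */

	pthread_mutex_lock(&grp->lock);
	if (grp == t->grp && grp->freezing)
		printf("freezing task in its current group\n");
	pthread_mutex_unlock(&grp->lock);
}

int main(void)
{
	struct freeze_group g = { PTHREAD_MUTEX_INITIALIZER, true };
	struct task t = { &g };

	maybe_freeze(&t);		/* build with: cc demo.c -pthread */
	return 0;
}
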
@@ -422,7 +442,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
422} 442}
423 443
424static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, 444static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
425 const char *buffer) 445 char *buffer)
426{ 446{
427 bool freeze; 447 bool freeze;
428 448
@@ -473,13 +493,11 @@ static struct cftype files[] = {
473 { } /* terminate */ 493 { } /* terminate */
474}; 494};
475 495
476struct cgroup_subsys freezer_subsys = { 496struct cgroup_subsys freezer_cgrp_subsys = {
477 .name = "freezer",
478 .css_alloc = freezer_css_alloc, 497 .css_alloc = freezer_css_alloc,
479 .css_online = freezer_css_online, 498 .css_online = freezer_css_online,
480 .css_offline = freezer_css_offline, 499 .css_offline = freezer_css_offline,
481 .css_free = freezer_css_free, 500 .css_free = freezer_css_free,
482 .subsys_id = freezer_subsys_id,
483 .attach = freezer_attach, 501 .attach = freezer_attach,
484 .fork = freezer_fork, 502 .fork = freezer_fork,
485 .base_cftypes = files, 503 .base_cftypes = files,
diff --git a/kernel/compat.c b/kernel/compat.c
index 0a09e481b70b..e40b0430b562 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -30,28 +30,6 @@
30 30
31#include <asm/uaccess.h> 31#include <asm/uaccess.h>
32 32
33/*
34 * Get/set struct timeval with struct timespec on the native side
35 */
36static int compat_get_timeval_convert(struct timespec *o,
37 struct compat_timeval __user *i)
38{
39 long usec;
40
41 if (get_user(o->tv_sec, &i->tv_sec) ||
42 get_user(usec, &i->tv_usec))
43 return -EFAULT;
44 o->tv_nsec = usec * 1000;
45 return 0;
46}
47
48static int compat_put_timeval_convert(struct compat_timeval __user *o,
49 struct timeval *i)
50{
51 return (put_user(i->tv_sec, &o->tv_sec) ||
52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
53}
54
55static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) 33static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
56{ 34{
57 memset(txc, 0, sizeof(struct timex)); 35 memset(txc, 0, sizeof(struct timex));
@@ -110,13 +88,13 @@ static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
110 return 0; 88 return 0;
111} 89}
112 90
113asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, 91COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
114 struct timezone __user *tz) 92 struct timezone __user *, tz)
115{ 93{
116 if (tv) { 94 if (tv) {
117 struct timeval ktv; 95 struct timeval ktv;
118 do_gettimeofday(&ktv); 96 do_gettimeofday(&ktv);
119 if (compat_put_timeval_convert(tv, &ktv)) 97 if (compat_put_timeval(&ktv, tv))
120 return -EFAULT; 98 return -EFAULT;
121 } 99 }
122 if (tz) { 100 if (tz) {
@@ -127,62 +105,61 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
127 return 0; 105 return 0;
128} 106}
129 107
130asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, 108COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
131 struct timezone __user *tz) 109 struct timezone __user *, tz)
132{ 110{
133 struct timespec kts; 111 struct timeval user_tv;
134 struct timezone ktz; 112 struct timespec new_ts;
113 struct timezone new_tz;
135 114
136 if (tv) { 115 if (tv) {
137 if (compat_get_timeval_convert(&kts, tv)) 116 if (compat_get_timeval(&user_tv, tv))
138 return -EFAULT; 117 return -EFAULT;
118 new_ts.tv_sec = user_tv.tv_sec;
119 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
139 } 120 }
140 if (tz) { 121 if (tz) {
141 if (copy_from_user(&ktz, tz, sizeof(ktz))) 122 if (copy_from_user(&new_tz, tz, sizeof(*tz)))
142 return -EFAULT; 123 return -EFAULT;
143 } 124 }
144 125
145 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); 126 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
146} 127}
147 128
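
The compat settimeofday path above only needs to widen the seconds field and
scale microseconds to nanoseconds (tv_usec * NSEC_PER_USEC). A stand-alone
illustration of that conversion, using local stand-in structs rather than the
kernel's types:

#include <stdio.h>

#define NSEC_PER_USEC 1000L

struct my_timeval  { long long tv_sec; long tv_usec; };
struct my_timespec { long long tv_sec; long tv_nsec; };

int main(void)
{
	struct my_timeval  tv = { 1395000000, 250000 };	/* 250 ms expressed in usec */
	struct my_timespec ts;

	/* same arithmetic the compat syscall performs before do_sys_settimeofday() */
	ts.tv_sec  = tv.tv_sec;
	ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC;

	printf("%lld.%09ld\n", ts.tv_sec, ts.tv_nsec);	/* 1395000000.250000000 */
	return 0;
}
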
148int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) 129static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
149{ 130{
150 return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || 131 return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
151 __get_user(tv->tv_sec, &ctv->tv_sec) || 132 __get_user(tv->tv_sec, &ctv->tv_sec) ||
152 __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; 133 __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
153} 134}
154EXPORT_SYMBOL_GPL(get_compat_timeval);
155 135
156int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) 136static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv)
157{ 137{
158 return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || 138 return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
159 __put_user(tv->tv_sec, &ctv->tv_sec) || 139 __put_user(tv->tv_sec, &ctv->tv_sec) ||
160 __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; 140 __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
161} 141}
162EXPORT_SYMBOL_GPL(put_compat_timeval);
163 142
164int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) 143static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
165{ 144{
166 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || 145 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
167 __get_user(ts->tv_sec, &cts->tv_sec) || 146 __get_user(ts->tv_sec, &cts->tv_sec) ||
168 __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; 147 __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
169} 148}
170EXPORT_SYMBOL_GPL(get_compat_timespec);
171 149
172int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) 150static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
173{ 151{
174 return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || 152 return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) ||
175 __put_user(ts->tv_sec, &cts->tv_sec) || 153 __put_user(ts->tv_sec, &cts->tv_sec) ||
176 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; 154 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
177} 155}
178EXPORT_SYMBOL_GPL(put_compat_timespec);
179 156
180int compat_get_timeval(struct timeval *tv, const void __user *utv) 157int compat_get_timeval(struct timeval *tv, const void __user *utv)
181{ 158{
182 if (COMPAT_USE_64BIT_TIME) 159 if (COMPAT_USE_64BIT_TIME)
183 return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; 160 return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0;
184 else 161 else
185 return get_compat_timeval(tv, utv); 162 return __compat_get_timeval(tv, utv);
186} 163}
187EXPORT_SYMBOL_GPL(compat_get_timeval); 164EXPORT_SYMBOL_GPL(compat_get_timeval);
188 165
@@ -191,7 +168,7 @@ int compat_put_timeval(const struct timeval *tv, void __user *utv)
191 if (COMPAT_USE_64BIT_TIME) 168 if (COMPAT_USE_64BIT_TIME)
192 return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; 169 return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0;
193 else 170 else
194 return put_compat_timeval(tv, utv); 171 return __compat_put_timeval(tv, utv);
195} 172}
196EXPORT_SYMBOL_GPL(compat_put_timeval); 173EXPORT_SYMBOL_GPL(compat_put_timeval);
197 174
@@ -200,7 +177,7 @@ int compat_get_timespec(struct timespec *ts, const void __user *uts)
200 if (COMPAT_USE_64BIT_TIME) 177 if (COMPAT_USE_64BIT_TIME)
201 return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; 178 return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0;
202 else 179 else
203 return get_compat_timespec(ts, uts); 180 return __compat_get_timespec(ts, uts);
204} 181}
205EXPORT_SYMBOL_GPL(compat_get_timespec); 182EXPORT_SYMBOL_GPL(compat_get_timespec);
206 183
@@ -209,10 +186,33 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts)
209 if (COMPAT_USE_64BIT_TIME) 186 if (COMPAT_USE_64BIT_TIME)
210 return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; 187 return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0;
211 else 188 else
212 return put_compat_timespec(ts, uts); 189 return __compat_put_timespec(ts, uts);
213} 190}
214EXPORT_SYMBOL_GPL(compat_put_timespec); 191EXPORT_SYMBOL_GPL(compat_put_timespec);
215 192
193int compat_convert_timespec(struct timespec __user **kts,
194 const void __user *cts)
195{
196 struct timespec ts;
197 struct timespec __user *uts;
198
199 if (!cts || COMPAT_USE_64BIT_TIME) {
200 *kts = (struct timespec __user *)cts;
201 return 0;
202 }
203
204 uts = compat_alloc_user_space(sizeof(ts));
205 if (!uts)
206 return -EFAULT;
207 if (compat_get_timespec(&ts, cts))
208 return -EFAULT;
209 if (copy_to_user(uts, &ts, sizeof(ts)))
210 return -EFAULT;
211
212 *kts = uts;
213 return 0;
214}
215
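
compat_convert_timespec() above exists because a timespec written by 32-bit
user space does not have the layout that native struct timespec consumers
expect, so the value is rewritten in native form into scratch user memory from
compat_alloc_user_space(). A quick illustration of the size difference,
assuming an LP64 kernel with 32-bit compat tasks (as on x86-64/ia32); the
structs are stand-ins, not the kernel's definitions:

#include <stdio.h>
#include <stdint.h>

struct native_timespec   { int64_t tv_sec; long tv_nsec; };	/* 16 bytes on LP64 */
struct compat32_timespec { int32_t tv_sec; int32_t tv_nsec; };	/* 8 bytes */

int main(void)
{
	printf("native: %zu bytes, compat: %zu bytes\n",
	       sizeof(struct native_timespec), sizeof(struct compat32_timespec));
	return 0;
}
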
216static long compat_nanosleep_restart(struct restart_block *restart) 216static long compat_nanosleep_restart(struct restart_block *restart)
217{ 217{
218 struct compat_timespec __user *rmtp; 218 struct compat_timespec __user *rmtp;
@@ -229,21 +229,21 @@ static long compat_nanosleep_restart(struct restart_block *restart)
229 if (ret) { 229 if (ret) {
230 rmtp = restart->nanosleep.compat_rmtp; 230 rmtp = restart->nanosleep.compat_rmtp;
231 231
232 if (rmtp && put_compat_timespec(&rmt, rmtp)) 232 if (rmtp && compat_put_timespec(&rmt, rmtp))
233 return -EFAULT; 233 return -EFAULT;
234 } 234 }
235 235
236 return ret; 236 return ret;
237} 237}
238 238
239asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, 239COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
240 struct compat_timespec __user *rmtp) 240 struct compat_timespec __user *, rmtp)
241{ 241{
242 struct timespec tu, rmt; 242 struct timespec tu, rmt;
243 mm_segment_t oldfs; 243 mm_segment_t oldfs;
244 long ret; 244 long ret;
245 245
246 if (get_compat_timespec(&tu, rqtp)) 246 if (compat_get_timespec(&tu, rqtp))
247 return -EFAULT; 247 return -EFAULT;
248 248
249 if (!timespec_valid(&tu)) 249 if (!timespec_valid(&tu))
@@ -263,7 +263,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
263 restart->fn = compat_nanosleep_restart; 263 restart->fn = compat_nanosleep_restart;
264 restart->nanosleep.compat_rmtp = rmtp; 264 restart->nanosleep.compat_rmtp = rmtp;
265 265
266 if (rmtp && put_compat_timespec(&rmt, rmtp)) 266 if (rmtp && compat_put_timespec(&rmt, rmtp))
267 return -EFAULT; 267 return -EFAULT;
268 } 268 }
269 269
@@ -328,7 +328,7 @@ static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
328 return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); 328 return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
329} 329}
330 330
331asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) 331COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
332{ 332{
333 if (tbuf) { 333 if (tbuf) {
334 struct tms tms; 334 struct tms tms;
@@ -354,7 +354,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
354 * types that can be passed to put_user()/get_user(). 354 * types that can be passed to put_user()/get_user().
355 */ 355 */
356 356
357asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) 357COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
358{ 358{
359 old_sigset_t s; 359 old_sigset_t s;
360 long ret; 360 long ret;
@@ -424,8 +424,8 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
424 424
425#endif 425#endif
426 426
427asmlinkage long compat_sys_setrlimit(unsigned int resource, 427COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
428 struct compat_rlimit __user *rlim) 428 struct compat_rlimit __user *, rlim)
429{ 429{
430 struct rlimit r; 430 struct rlimit r;
431 431
@@ -443,15 +443,15 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
443 443
444#ifdef COMPAT_RLIM_OLD_INFINITY 444#ifdef COMPAT_RLIM_OLD_INFINITY
445 445
446asmlinkage long compat_sys_old_getrlimit(unsigned int resource, 446COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
447 struct compat_rlimit __user *rlim) 447 struct compat_rlimit __user *, rlim)
448{ 448{
449 struct rlimit r; 449 struct rlimit r;
450 int ret; 450 int ret;
451 mm_segment_t old_fs = get_fs(); 451 mm_segment_t old_fs = get_fs();
452 452
453 set_fs(KERNEL_DS); 453 set_fs(KERNEL_DS);
454 ret = sys_old_getrlimit(resource, &r); 454 ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r);
455 set_fs(old_fs); 455 set_fs(old_fs);
456 456
457 if (!ret) { 457 if (!ret) {
@@ -470,8 +470,8 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
470 470
471#endif 471#endif
472 472
473asmlinkage long compat_sys_getrlimit(unsigned int resource, 473COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
474 struct compat_rlimit __user *rlim) 474 struct compat_rlimit __user *, rlim)
475{ 475{
476 struct rlimit r; 476 struct rlimit r;
477 int ret; 477 int ret;
@@ -596,9 +596,9 @@ static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
596 return compat_get_bitmap(k, user_mask_ptr, len * 8); 596 return compat_get_bitmap(k, user_mask_ptr, len * 8);
597} 597}
598 598
599asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, 599COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid,
600 unsigned int len, 600 unsigned int, len,
601 compat_ulong_t __user *user_mask_ptr) 601 compat_ulong_t __user *, user_mask_ptr)
602{ 602{
603 cpumask_var_t new_mask; 603 cpumask_var_t new_mask;
604 int retval; 604 int retval;
@@ -616,8 +616,8 @@ out:
616 return retval; 616 return retval;
617} 617}
618 618
619asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, 619COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
620 compat_ulong_t __user *user_mask_ptr) 620 compat_ulong_t __user *, user_mask_ptr)
621{ 621{
622 int ret; 622 int ret;
623 cpumask_var_t mask; 623 cpumask_var_t mask;
@@ -647,8 +647,8 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
647int get_compat_itimerspec(struct itimerspec *dst, 647int get_compat_itimerspec(struct itimerspec *dst,
648 const struct compat_itimerspec __user *src) 648 const struct compat_itimerspec __user *src)
649{ 649{
650 if (get_compat_timespec(&dst->it_interval, &src->it_interval) || 650 if (__compat_get_timespec(&dst->it_interval, &src->it_interval) ||
651 get_compat_timespec(&dst->it_value, &src->it_value)) 651 __compat_get_timespec(&dst->it_value, &src->it_value))
652 return -EFAULT; 652 return -EFAULT;
653 return 0; 653 return 0;
654} 654}
@@ -656,15 +656,15 @@ int get_compat_itimerspec(struct itimerspec *dst,
656int put_compat_itimerspec(struct compat_itimerspec __user *dst, 656int put_compat_itimerspec(struct compat_itimerspec __user *dst,
657 const struct itimerspec *src) 657 const struct itimerspec *src)
658{ 658{
659 if (put_compat_timespec(&src->it_interval, &dst->it_interval) || 659 if (__compat_put_timespec(&src->it_interval, &dst->it_interval) ||
660 put_compat_timespec(&src->it_value, &dst->it_value)) 660 __compat_put_timespec(&src->it_value, &dst->it_value))
661 return -EFAULT; 661 return -EFAULT;
662 return 0; 662 return 0;
663} 663}
664 664
665long compat_sys_timer_create(clockid_t which_clock, 665COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
666 struct compat_sigevent __user *timer_event_spec, 666 struct compat_sigevent __user *, timer_event_spec,
667 timer_t __user *created_timer_id) 667 timer_t __user *, created_timer_id)
668{ 668{
669 struct sigevent __user *event = NULL; 669 struct sigevent __user *event = NULL;
670 670
@@ -680,9 +680,9 @@ long compat_sys_timer_create(clockid_t which_clock,
680 return sys_timer_create(which_clock, event, created_timer_id); 680 return sys_timer_create(which_clock, event, created_timer_id);
681} 681}
682 682
683long compat_sys_timer_settime(timer_t timer_id, int flags, 683COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
684 struct compat_itimerspec __user *new, 684 struct compat_itimerspec __user *, new,
685 struct compat_itimerspec __user *old) 685 struct compat_itimerspec __user *, old)
686{ 686{
687 long err; 687 long err;
688 mm_segment_t oldfs; 688 mm_segment_t oldfs;
@@ -703,8 +703,8 @@ long compat_sys_timer_settime(timer_t timer_id, int flags,
703 return err; 703 return err;
704} 704}
705 705
706long compat_sys_timer_gettime(timer_t timer_id, 706COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
707 struct compat_itimerspec __user *setting) 707 struct compat_itimerspec __user *, setting)
708{ 708{
709 long err; 709 long err;
710 mm_segment_t oldfs; 710 mm_segment_t oldfs;
@@ -720,14 +720,14 @@ long compat_sys_timer_gettime(timer_t timer_id,
720 return err; 720 return err;
721} 721}
722 722
723long compat_sys_clock_settime(clockid_t which_clock, 723COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
724 struct compat_timespec __user *tp) 724 struct compat_timespec __user *, tp)
725{ 725{
726 long err; 726 long err;
727 mm_segment_t oldfs; 727 mm_segment_t oldfs;
728 struct timespec ts; 728 struct timespec ts;
729 729
730 if (get_compat_timespec(&ts, tp)) 730 if (compat_get_timespec(&ts, tp))
731 return -EFAULT; 731 return -EFAULT;
732 oldfs = get_fs(); 732 oldfs = get_fs();
733 set_fs(KERNEL_DS); 733 set_fs(KERNEL_DS);
@@ -737,8 +737,8 @@ long compat_sys_clock_settime(clockid_t which_clock,
737 return err; 737 return err;
738} 738}
739 739
740long compat_sys_clock_gettime(clockid_t which_clock, 740COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
741 struct compat_timespec __user *tp) 741 struct compat_timespec __user *, tp)
742{ 742{
743 long err; 743 long err;
744 mm_segment_t oldfs; 744 mm_segment_t oldfs;
@@ -749,13 +749,13 @@ long compat_sys_clock_gettime(clockid_t which_clock,
749 err = sys_clock_gettime(which_clock, 749 err = sys_clock_gettime(which_clock,
750 (struct timespec __user *) &ts); 750 (struct timespec __user *) &ts);
751 set_fs(oldfs); 751 set_fs(oldfs);
752 if (!err && put_compat_timespec(&ts, tp)) 752 if (!err && compat_put_timespec(&ts, tp))
753 return -EFAULT; 753 return -EFAULT;
754 return err; 754 return err;
755} 755}
756 756
757long compat_sys_clock_adjtime(clockid_t which_clock, 757COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
758 struct compat_timex __user *utp) 758 struct compat_timex __user *, utp)
759{ 759{
760 struct timex txc; 760 struct timex txc;
761 mm_segment_t oldfs; 761 mm_segment_t oldfs;
@@ -777,8 +777,8 @@ long compat_sys_clock_adjtime(clockid_t which_clock,
777 return ret; 777 return ret;
778} 778}
779 779
780long compat_sys_clock_getres(clockid_t which_clock, 780COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
781 struct compat_timespec __user *tp) 781 struct compat_timespec __user *, tp)
782{ 782{
783 long err; 783 long err;
784 mm_segment_t oldfs; 784 mm_segment_t oldfs;
@@ -789,7 +789,7 @@ long compat_sys_clock_getres(clockid_t which_clock,
789 err = sys_clock_getres(which_clock, 789 err = sys_clock_getres(which_clock,
790 (struct timespec __user *) &ts); 790 (struct timespec __user *) &ts);
791 set_fs(oldfs); 791 set_fs(oldfs);
792 if (!err && tp && put_compat_timespec(&ts, tp)) 792 if (!err && tp && compat_put_timespec(&ts, tp))
793 return -EFAULT; 793 return -EFAULT;
794 return err; 794 return err;
795} 795}
@@ -799,7 +799,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
799 long err; 799 long err;
800 mm_segment_t oldfs; 800 mm_segment_t oldfs;
801 struct timespec tu; 801 struct timespec tu;
802 struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp; 802 struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp;
803 803
804 restart->nanosleep.rmtp = (struct timespec __user *) &tu; 804 restart->nanosleep.rmtp = (struct timespec __user *) &tu;
805 oldfs = get_fs(); 805 oldfs = get_fs();
@@ -808,7 +808,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
808 set_fs(oldfs); 808 set_fs(oldfs);
809 809
810 if ((err == -ERESTART_RESTARTBLOCK) && rmtp && 810 if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
811 put_compat_timespec(&tu, rmtp)) 811 compat_put_timespec(&tu, rmtp))
812 return -EFAULT; 812 return -EFAULT;
813 813
814 if (err == -ERESTART_RESTARTBLOCK) { 814 if (err == -ERESTART_RESTARTBLOCK) {
@@ -818,16 +818,16 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart)
818 return err; 818 return err;
819} 819}
820 820
821long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, 821COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
822 struct compat_timespec __user *rqtp, 822 struct compat_timespec __user *, rqtp,
823 struct compat_timespec __user *rmtp) 823 struct compat_timespec __user *, rmtp)
824{ 824{
825 long err; 825 long err;
826 mm_segment_t oldfs; 826 mm_segment_t oldfs;
827 struct timespec in, out; 827 struct timespec in, out;
828 struct restart_block *restart; 828 struct restart_block *restart;
829 829
830 if (get_compat_timespec(&in, rqtp)) 830 if (compat_get_timespec(&in, rqtp))
831 return -EFAULT; 831 return -EFAULT;
832 832
833 oldfs = get_fs(); 833 oldfs = get_fs();
@@ -838,7 +838,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
838 set_fs(oldfs); 838 set_fs(oldfs);
839 839
840 if ((err == -ERESTART_RESTARTBLOCK) && rmtp && 840 if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
841 put_compat_timespec(&out, rmtp)) 841 compat_put_timespec(&out, rmtp))
842 return -EFAULT; 842 return -EFAULT;
843 843
844 if (err == -ERESTART_RESTARTBLOCK) { 844 if (err == -ERESTART_RESTARTBLOCK) {
@@ -1010,7 +1010,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
1010 1010
1011/* compat_time_t is a 32 bit "long" and needs to get converted. */ 1011/* compat_time_t is a 32 bit "long" and needs to get converted. */
1012 1012
1013asmlinkage long compat_sys_time(compat_time_t __user * tloc) 1013COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
1014{ 1014{
1015 compat_time_t i; 1015 compat_time_t i;
1016 struct timeval tv; 1016 struct timeval tv;
@@ -1026,7 +1026,7 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc)
1026 return i; 1026 return i;
1027} 1027}
1028 1028
1029asmlinkage long compat_sys_stime(compat_time_t __user *tptr) 1029COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
1030{ 1030{
1031 struct timespec tv; 1031 struct timespec tv;
1032 int err; 1032 int err;
@@ -1046,7 +1046,7 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
1046 1046
1047#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ 1047#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
1048 1048
1049asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) 1049COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
1050{ 1050{
1051 struct timex txc; 1051 struct timex txc;
1052 int err, ret; 1052 int err, ret;
@@ -1065,11 +1065,11 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
1065} 1065}
1066 1066
1067#ifdef CONFIG_NUMA 1067#ifdef CONFIG_NUMA
1068asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, 1068COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
1069 compat_uptr_t __user *pages32, 1069 compat_uptr_t __user *, pages32,
1070 const int __user *nodes, 1070 const int __user *, nodes,
1071 int __user *status, 1071 int __user *, status,
1072 int flags) 1072 int, flags)
1073{ 1073{
1074 const void __user * __user *pages; 1074 const void __user * __user *pages;
1075 int i; 1075 int i;
@@ -1085,10 +1085,10 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
1085 return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); 1085 return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
1086} 1086}
1087 1087
1088asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, 1088COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1089 compat_ulong_t maxnode, 1089 compat_ulong_t, maxnode,
1090 const compat_ulong_t __user *old_nodes, 1090 const compat_ulong_t __user *, old_nodes,
1091 const compat_ulong_t __user *new_nodes) 1091 const compat_ulong_t __user *, new_nodes)
1092{ 1092{
1093 unsigned long __user *old = NULL; 1093 unsigned long __user *old = NULL;
1094 unsigned long __user *new = NULL; 1094 unsigned long __user *new = NULL;
@@ -1130,7 +1130,7 @@ COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
1130 set_fs(KERNEL_DS); 1130 set_fs(KERNEL_DS);
1131 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); 1131 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
1132 set_fs(old_fs); 1132 set_fs(old_fs);
1133 if (put_compat_timespec(&t, interval)) 1133 if (compat_put_timespec(&t, interval))
1134 return -EFAULT; 1134 return -EFAULT;
1135 return ret; 1135 return ret;
1136} 1136}
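Note: the recurring change across kernel/compat.c above is from open-coded "asmlinkage long compat_sys_*()" definitions to the COMPAT_SYSCALL_DEFINEn() wrappers, which take the syscall name followed by alternating argument types and names. A hedged sketch of a freshly written entry point in that style; "example_getres" and its body are hypothetical, and the macro's actual expansion (in include/linux/compat.h) is not shown.

#include <linux/compat.h>
#include <linux/errno.h>
#include <linux/time.h>

/* Hedged sketch: only the shape of the COMPAT_SYSCALL_DEFINE2() definition
 * matters here; this is not wired into any syscall table. */
COMPAT_SYSCALL_DEFINE2(example_getres, clockid_t, which_clock,
                       struct compat_timespec __user *, tp)
{
        struct timespec ts = { 0, 0 };

        /* a real implementation would fill @ts for @which_clock here */

        if (tp && compat_put_timespec(&ts, tp))
                return -EFAULT;
        return 0;
}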
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
deleted file mode 100644
index 59ab052ef7a0..000000000000
--- a/kernel/cpu/Makefile
+++ /dev/null
@@ -1 +0,0 @@
1obj-y = idle.o
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
deleted file mode 100644
index 277f494c2a9a..000000000000
--- a/kernel/cpu/idle.c
+++ /dev/null
@@ -1,144 +0,0 @@
1/*
2 * Generic entry point for the idle threads
3 */
4#include <linux/sched.h>
5#include <linux/cpu.h>
6#include <linux/tick.h>
7#include <linux/mm.h>
8#include <linux/stackprotector.h>
9
10#include <asm/tlb.h>
11
12#include <trace/events/power.h>
13
14static int __read_mostly cpu_idle_force_poll;
15
16void cpu_idle_poll_ctrl(bool enable)
17{
18 if (enable) {
19 cpu_idle_force_poll++;
20 } else {
21 cpu_idle_force_poll--;
22 WARN_ON_ONCE(cpu_idle_force_poll < 0);
23 }
24}
25
26#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
27static int __init cpu_idle_poll_setup(char *__unused)
28{
29 cpu_idle_force_poll = 1;
30 return 1;
31}
32__setup("nohlt", cpu_idle_poll_setup);
33
34static int __init cpu_idle_nopoll_setup(char *__unused)
35{
36 cpu_idle_force_poll = 0;
37 return 1;
38}
39__setup("hlt", cpu_idle_nopoll_setup);
40#endif
41
42static inline int cpu_idle_poll(void)
43{
44 rcu_idle_enter();
45 trace_cpu_idle_rcuidle(0, smp_processor_id());
46 local_irq_enable();
47 while (!tif_need_resched())
48 cpu_relax();
49 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
50 rcu_idle_exit();
51 return 1;
52}
53
54/* Weak implementations for optional arch specific functions */
55void __weak arch_cpu_idle_prepare(void) { }
56void __weak arch_cpu_idle_enter(void) { }
57void __weak arch_cpu_idle_exit(void) { }
58void __weak arch_cpu_idle_dead(void) { }
59void __weak arch_cpu_idle(void)
60{
61 cpu_idle_force_poll = 1;
62 local_irq_enable();
63}
64
65/*
66 * Generic idle loop implementation
67 */
68static void cpu_idle_loop(void)
69{
70 while (1) {
71 tick_nohz_idle_enter();
72
73 while (!need_resched()) {
74 check_pgt_cache();
75 rmb();
76
77 if (cpu_is_offline(smp_processor_id()))
78 arch_cpu_idle_dead();
79
80 local_irq_disable();
81 arch_cpu_idle_enter();
82
83 /*
84 * In poll mode we reenable interrupts and spin.
85 *
86 * Also if we detected in the wakeup from idle
87 * path that the tick broadcast device expired
88 * for us, we don't want to go deep idle as we
89 * know that the IPI is going to arrive right
90 * away
91 */
92 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
93 cpu_idle_poll();
94 } else {
95 if (!current_clr_polling_and_test()) {
96 stop_critical_timings();
97 rcu_idle_enter();
98 arch_cpu_idle();
99 WARN_ON_ONCE(irqs_disabled());
100 rcu_idle_exit();
101 start_critical_timings();
102 } else {
103 local_irq_enable();
104 }
105 __current_set_polling();
106 }
107 arch_cpu_idle_exit();
108 }
109
110 /*
111 * Since we fell out of the loop above, we know
112 * TIF_NEED_RESCHED must be set, propagate it into
113 * PREEMPT_NEED_RESCHED.
114 *
115 * This is required because for polling idle loops we will
116 * not have had an IPI to fold the state for us.
117 */
118 preempt_set_need_resched();
119 tick_nohz_idle_exit();
120 schedule_preempt_disabled();
121 }
122}
123
124void cpu_startup_entry(enum cpuhp_state state)
125{
126 /*
127 * This #ifdef needs to die, but it's too late in the cycle to
128 * make this generic (arm and sh have never invoked the canary
129 * init for the non boot cpus!). Will be fixed in 3.11
130 */
131#ifdef CONFIG_X86
132 /*
133 * If we're the non-boot CPU, nothing set the stack canary up
134 * for us. The boot CPU already has it initialized but no harm
135 * in doing it again. This is a good place for updating it, as
136 * we wont ever return from this function (so the invalid
137 * canaries already on the stack wont ever trigger).
138 */
139 boot_init_stack_canary();
140#endif
141 __current_set_polling();
142 arch_cpu_idle_prepare();
143 cpu_idle_loop();
144}
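Note: the file removed above provides cpu_idle_poll_ctrl(), which lets other code force the idle loop into polling; the counter-based nesting is visible in the deleted code. A hedged usage sketch with a hypothetical caller, showing how a latency-sensitive window would bracket the calls.

#include <linux/cpu.h>

/* Hedged sketch: example_low_latency_window() is hypothetical; the calls
 * nest, matching the cpu_idle_force_poll counter above. */
static void example_low_latency_window(void)
{
        cpu_idle_poll_ctrl(true);       /* idle CPUs poll instead of halting */

        /* latency-critical work that must not wait for deep-idle exit */

        cpu_idle_poll_ctrl(false);      /* restore normal idle behaviour */
}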
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e6b1b66afe52..3d54c418bd06 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -119,7 +119,7 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
119/* Retrieve the cpuset for a task */ 119/* Retrieve the cpuset for a task */
120static inline struct cpuset *task_cs(struct task_struct *task) 120static inline struct cpuset *task_cs(struct task_struct *task)
121{ 121{
122 return css_cs(task_css(task, cpuset_subsys_id)); 122 return css_cs(task_css(task, cpuset_cgrp_id));
123} 123}
124 124
125static inline struct cpuset *parent_cs(struct cpuset *cs) 125static inline struct cpuset *parent_cs(struct cpuset *cs)
@@ -467,7 +467,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
467 * be changed to have empty cpus_allowed or mems_allowed. 467 * be changed to have empty cpus_allowed or mems_allowed.
468 */ 468 */
469 ret = -ENOSPC; 469 ret = -ENOSPC;
470 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) { 470 if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
471 if (!cpumask_empty(cur->cpus_allowed) && 471 if (!cpumask_empty(cur->cpus_allowed) &&
472 cpumask_empty(trial->cpus_allowed)) 472 cpumask_empty(trial->cpus_allowed))
473 goto out; 473 goto out;
@@ -829,55 +829,36 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
829} 829}
830 830
831/** 831/**
832 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
833 * @tsk: task to test
834 * @data: cpuset to @tsk belongs to
835 *
836 * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed
837 * mask needs to be changed.
838 *
839 * We don't need to re-check for the cgroup/cpuset membership, since we're
840 * holding cpuset_mutex at this point.
841 */
842static void cpuset_change_cpumask(struct task_struct *tsk, void *data)
843{
844 struct cpuset *cs = data;
845 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
846
847 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
848}
849
850/**
851 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 832 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
852 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 833 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
853 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
854 *
855 * Called with cpuset_mutex held
856 * 834 *
857 * The css_scan_tasks() function will scan all the tasks in a cgroup, 835 * Iterate through each task of @cs updating its cpus_allowed to the
858 * calling callback functions for each. 836 * effective cpuset's. As this function is called with cpuset_mutex held,
859 * 837 * cpuset membership stays stable.
860 * No return value. It's guaranteed that css_scan_tasks() always returns 0
861 * if @heap != NULL.
862 */ 838 */
863static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) 839static void update_tasks_cpumask(struct cpuset *cs)
864{ 840{
865 css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); 841 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
842 struct css_task_iter it;
843 struct task_struct *task;
844
845 css_task_iter_start(&cs->css, &it);
846 while ((task = css_task_iter_next(&it)))
847 set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
848 css_task_iter_end(&it);
866} 849}
867 850
868/* 851/*
869 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 852 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
870 * @root_cs: the root cpuset of the hierarchy 853 * @root_cs: the root cpuset of the hierarchy
871 * @update_root: update root cpuset or not? 854 * @update_root: update root cpuset or not?
872 * @heap: the heap used by css_scan_tasks()
873 * 855 *
874 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 856 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
875 * which take on cpumask of @root_cs. 857 * which take on cpumask of @root_cs.
876 * 858 *
877 * Called with cpuset_mutex held 859 * Called with cpuset_mutex held
878 */ 860 */
879static void update_tasks_cpumask_hier(struct cpuset *root_cs, 861static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
880 bool update_root, struct ptr_heap *heap)
881{ 862{
882 struct cpuset *cp; 863 struct cpuset *cp;
883 struct cgroup_subsys_state *pos_css; 864 struct cgroup_subsys_state *pos_css;
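Note: the conversion in this hunk, repeated for update_tasks_nodemask() and update_tasks_flags() below, drops css_scan_tasks() and its per-call heap in favour of the css_task_iter API. A minimal sketch of that three-call idiom, assuming only the iterator functions already used in the new code; the wrapper name and the per-task work are placeholders.

#include <linux/cgroup.h>
#include <linux/sched.h>

/* Hedged sketch of the iterator idiom used by the new update_tasks_*()
 * helpers above and below. */
static void example_for_each_css_task(struct cgroup_subsys_state *css)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(css, &it);
        while ((task = css_task_iter_next(&it))) {
                /* operate on @task, e.g. set_cpus_allowed_ptr(task, ...) */
        }
        css_task_iter_end(&it);
}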
@@ -898,7 +879,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
898 continue; 879 continue;
899 rcu_read_unlock(); 880 rcu_read_unlock();
900 881
901 update_tasks_cpumask(cp, heap); 882 update_tasks_cpumask(cp);
902 883
903 rcu_read_lock(); 884 rcu_read_lock();
904 css_put(&cp->css); 885 css_put(&cp->css);
@@ -914,7 +895,6 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
914static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 895static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
915 const char *buf) 896 const char *buf)
916{ 897{
917 struct ptr_heap heap;
918 int retval; 898 int retval;
919 int is_load_balanced; 899 int is_load_balanced;
920 900
@@ -947,19 +927,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
947 if (retval < 0) 927 if (retval < 0)
948 return retval; 928 return retval;
949 929
950 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
951 if (retval)
952 return retval;
953
954 is_load_balanced = is_sched_load_balance(trialcs); 930 is_load_balanced = is_sched_load_balance(trialcs);
955 931
956 mutex_lock(&callback_mutex); 932 mutex_lock(&callback_mutex);
957 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 933 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
958 mutex_unlock(&callback_mutex); 934 mutex_unlock(&callback_mutex);
959 935
960 update_tasks_cpumask_hier(cs, true, &heap); 936 update_tasks_cpumask_hier(cs, true);
961
962 heap_free(&heap);
963 937
964 if (is_load_balanced) 938 if (is_load_balanced)
965 rebuild_sched_domains_locked(); 939 rebuild_sched_domains_locked();
@@ -1022,7 +996,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1022 task_lock(tsk); 996 task_lock(tsk);
1023 /* 997 /*
1024 * Determine if a loop is necessary if another thread is doing 998 * Determine if a loop is necessary if another thread is doing
1025 * get_mems_allowed(). If at least one node remains unchanged and 999 * read_mems_allowed_begin(). If at least one node remains unchanged and
1026 * tsk does not have a mempolicy, then an empty nodemask will not be 1000 * tsk does not have a mempolicy, then an empty nodemask will not be
1027 * possible when mems_allowed is larger than a word. 1001 * possible when mems_allowed is larger than a word.
1028 */ 1002 */
@@ -1048,53 +1022,22 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1048 task_unlock(tsk); 1022 task_unlock(tsk);
1049} 1023}
1050 1024
1051struct cpuset_change_nodemask_arg {
1052 struct cpuset *cs;
1053 nodemask_t *newmems;
1054};
1055
1056/*
1057 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1058 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1059 * memory_migrate flag is set. Called with cpuset_mutex held.
1060 */
1061static void cpuset_change_nodemask(struct task_struct *p, void *data)
1062{
1063 struct cpuset_change_nodemask_arg *arg = data;
1064 struct cpuset *cs = arg->cs;
1065 struct mm_struct *mm;
1066 int migrate;
1067
1068 cpuset_change_task_nodemask(p, arg->newmems);
1069
1070 mm = get_task_mm(p);
1071 if (!mm)
1072 return;
1073
1074 migrate = is_memory_migrate(cs);
1075
1076 mpol_rebind_mm(mm, &cs->mems_allowed);
1077 if (migrate)
1078 cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems);
1079 mmput(mm);
1080}
1081
1082static void *cpuset_being_rebound; 1025static void *cpuset_being_rebound;
1083 1026
1084/** 1027/**
1085 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1028 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1086 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1029 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1087 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1088 * 1030 *
1089 * Called with cpuset_mutex held. No return value. It's guaranteed that 1031 * Iterate through each task of @cs updating its mems_allowed to the
1090 * css_scan_tasks() always returns 0 if @heap != NULL. 1032 * effective cpuset's. As this function is called with cpuset_mutex held,
1033 * cpuset membership stays stable.
1091 */ 1034 */
1092static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) 1035static void update_tasks_nodemask(struct cpuset *cs)
1093{ 1036{
1094 static nodemask_t newmems; /* protected by cpuset_mutex */ 1037 static nodemask_t newmems; /* protected by cpuset_mutex */
1095 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1038 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1096 struct cpuset_change_nodemask_arg arg = { .cs = cs, 1039 struct css_task_iter it;
1097 .newmems = &newmems }; 1040 struct task_struct *task;
1098 1041
1099 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1042 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1100 1043
@@ -1110,7 +1053,25 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1110 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1053 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1111 * is idempotent. Also migrate pages in each mm to new nodes. 1054 * is idempotent. Also migrate pages in each mm to new nodes.
1112 */ 1055 */
1113 css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); 1056 css_task_iter_start(&cs->css, &it);
1057 while ((task = css_task_iter_next(&it))) {
1058 struct mm_struct *mm;
1059 bool migrate;
1060
1061 cpuset_change_task_nodemask(task, &newmems);
1062
1063 mm = get_task_mm(task);
1064 if (!mm)
1065 continue;
1066
1067 migrate = is_memory_migrate(cs);
1068
1069 mpol_rebind_mm(mm, &cs->mems_allowed);
1070 if (migrate)
1071 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1072 mmput(mm);
1073 }
1074 css_task_iter_end(&it);
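Note: the per-task mm handling in the loop above follows the usual reference pattern. A hedged sketch of just that pattern, with the actual rebind/migrate work elided.

#include <linux/sched.h>

/* Hedged sketch of the mm reference handling used inside the loop above;
 * the wrapper name is hypothetical. */
static void example_for_task_mm(struct task_struct *task)
{
        struct mm_struct *mm = get_task_mm(task);

        if (!mm)
                return;         /* kernel thread, or the mm is already gone */

        /* rebind/migrate against @mm here */

        mmput(mm);
}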
1114 1075
1115 /* 1076 /*
1116 * All the tasks' nodemasks have been updated, update 1077 * All the tasks' nodemasks have been updated, update
@@ -1126,15 +1087,13 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1126 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1087 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1127 * @cs: the root cpuset of the hierarchy 1088 * @cs: the root cpuset of the hierarchy
1128 * @update_root: update the root cpuset or not? 1089 * @update_root: update the root cpuset or not?
1129 * @heap: the heap used by css_scan_tasks()
1130 * 1090 *
1131 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1091 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1132 * which take on nodemask of @root_cs. 1092 * which take on nodemask of @root_cs.
1133 * 1093 *
1134 * Called with cpuset_mutex held 1094 * Called with cpuset_mutex held
1135 */ 1095 */
1136static void update_tasks_nodemask_hier(struct cpuset *root_cs, 1096static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
1137 bool update_root, struct ptr_heap *heap)
1138{ 1097{
1139 struct cpuset *cp; 1098 struct cpuset *cp;
1140 struct cgroup_subsys_state *pos_css; 1099 struct cgroup_subsys_state *pos_css;
@@ -1155,7 +1114,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1155 continue; 1114 continue;
1156 rcu_read_unlock(); 1115 rcu_read_unlock();
1157 1116
1158 update_tasks_nodemask(cp, heap); 1117 update_tasks_nodemask(cp);
1159 1118
1160 rcu_read_lock(); 1119 rcu_read_lock();
1161 css_put(&cp->css); 1120 css_put(&cp->css);
@@ -1180,7 +1139,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1180 const char *buf) 1139 const char *buf)
1181{ 1140{
1182 int retval; 1141 int retval;
1183 struct ptr_heap heap;
1184 1142
1185 /* 1143 /*
1186 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 1144 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
@@ -1219,17 +1177,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1219 if (retval < 0) 1177 if (retval < 0)
1220 goto done; 1178 goto done;
1221 1179
1222 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1223 if (retval < 0)
1224 goto done;
1225
1226 mutex_lock(&callback_mutex); 1180 mutex_lock(&callback_mutex);
1227 cs->mems_allowed = trialcs->mems_allowed; 1181 cs->mems_allowed = trialcs->mems_allowed;
1228 mutex_unlock(&callback_mutex); 1182 mutex_unlock(&callback_mutex);
1229 1183
1230 update_tasks_nodemask_hier(cs, true, &heap); 1184 update_tasks_nodemask_hier(cs, true);
1231
1232 heap_free(&heap);
1233done: 1185done:
1234 return retval; 1186 return retval;
1235} 1187}
@@ -1257,38 +1209,22 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1257} 1209}
1258 1210
1259/** 1211/**
1260 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1261 * @tsk: task to be updated
1262 * @data: cpuset to @tsk belongs to
1263 *
1264 * Called by css_scan_tasks() for each task in a cgroup.
1265 *
1266 * We don't need to re-check for the cgroup/cpuset membership, since we're
1267 * holding cpuset_mutex at this point.
1268 */
1269static void cpuset_change_flag(struct task_struct *tsk, void *data)
1270{
1271 struct cpuset *cs = data;
1272
1273 cpuset_update_task_spread_flag(cs, tsk);
1274}
1275
1276/**
1277 * update_tasks_flags - update the spread flags of tasks in the cpuset. 1212 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1278 * @cs: the cpuset in which each task's spread flags needs to be changed 1213 * @cs: the cpuset in which each task's spread flags needs to be changed
1279 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1280 *
1281 * Called with cpuset_mutex held
1282 * 1214 *
1283 * The css_scan_tasks() function will scan all the tasks in a cgroup, 1215 * Iterate through each task of @cs updating its spread flags. As this
1284 * calling callback functions for each. 1216 * function is called with cpuset_mutex held, cpuset membership stays
1285 * 1217 * stable.
1286 * No return value. It's guaranteed that css_scan_tasks() always returns 0
1287 * if @heap != NULL.
1288 */ 1218 */
1289static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) 1219static void update_tasks_flags(struct cpuset *cs)
1290{ 1220{
1291 css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); 1221 struct css_task_iter it;
1222 struct task_struct *task;
1223
1224 css_task_iter_start(&cs->css, &it);
1225 while ((task = css_task_iter_next(&it)))
1226 cpuset_update_task_spread_flag(cs, task);
1227 css_task_iter_end(&it);
1292} 1228}
1293 1229
1294/* 1230/*
@@ -1306,7 +1242,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1306 struct cpuset *trialcs; 1242 struct cpuset *trialcs;
1307 int balance_flag_changed; 1243 int balance_flag_changed;
1308 int spread_flag_changed; 1244 int spread_flag_changed;
1309 struct ptr_heap heap;
1310 int err; 1245 int err;
1311 1246
1312 trialcs = alloc_trial_cpuset(cs); 1247 trialcs = alloc_trial_cpuset(cs);
@@ -1322,10 +1257,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1322 if (err < 0) 1257 if (err < 0)
1323 goto out; 1258 goto out;
1324 1259
1325 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1326 if (err < 0)
1327 goto out;
1328
1329 balance_flag_changed = (is_sched_load_balance(cs) != 1260 balance_flag_changed = (is_sched_load_balance(cs) !=
1330 is_sched_load_balance(trialcs)); 1261 is_sched_load_balance(trialcs));
1331 1262
@@ -1340,8 +1271,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1340 rebuild_sched_domains_locked(); 1271 rebuild_sched_domains_locked();
1341 1272
1342 if (spread_flag_changed) 1273 if (spread_flag_changed)
1343 update_tasks_flags(cs, &heap); 1274 update_tasks_flags(cs);
1344 heap_free(&heap);
1345out: 1275out:
1346 free_trial_cpuset(trialcs); 1276 free_trial_cpuset(trialcs);
1347 return err; 1277 return err;
@@ -1445,6 +1375,8 @@ static int fmeter_getrate(struct fmeter *fmp)
1445 return val; 1375 return val;
1446} 1376}
1447 1377
1378static struct cpuset *cpuset_attach_old_cs;
1379
1448/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 1380/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1449static int cpuset_can_attach(struct cgroup_subsys_state *css, 1381static int cpuset_can_attach(struct cgroup_subsys_state *css,
1450 struct cgroup_taskset *tset) 1382 struct cgroup_taskset *tset)
@@ -1453,6 +1385,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1453 struct task_struct *task; 1385 struct task_struct *task;
1454 int ret; 1386 int ret;
1455 1387
1388 /* used later by cpuset_attach() */
1389 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
1390
1456 mutex_lock(&cpuset_mutex); 1391 mutex_lock(&cpuset_mutex);
1457 1392
1458 /* 1393 /*
@@ -1464,7 +1399,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1464 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1399 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1465 goto out_unlock; 1400 goto out_unlock;
1466 1401
1467 cgroup_taskset_for_each(task, css, tset) { 1402 cgroup_taskset_for_each(task, tset) {
1468 /* 1403 /*
1469 * Kthreads which disallow setaffinity shouldn't be moved 1404 * Kthreads which disallow setaffinity shouldn't be moved
1470 * to a new cpuset; we don't want to change their cpu 1405 * to a new cpuset; we don't want to change their cpu
@@ -1516,10 +1451,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1516 struct mm_struct *mm; 1451 struct mm_struct *mm;
1517 struct task_struct *task; 1452 struct task_struct *task;
1518 struct task_struct *leader = cgroup_taskset_first(tset); 1453 struct task_struct *leader = cgroup_taskset_first(tset);
1519 struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset,
1520 cpuset_subsys_id);
1521 struct cpuset *cs = css_cs(css); 1454 struct cpuset *cs = css_cs(css);
1522 struct cpuset *oldcs = css_cs(oldcss); 1455 struct cpuset *oldcs = cpuset_attach_old_cs;
1523 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); 1456 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1524 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1457 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1525 1458
@@ -1533,7 +1466,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1533 1466
1534 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1467 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1535 1468
1536 cgroup_taskset_for_each(task, css, tset) { 1469 cgroup_taskset_for_each(task, tset) {
1537 /* 1470 /*
1538 * can_attach beforehand should guarantee that this doesn't 1471 * can_attach beforehand should guarantee that this doesn't
1539 * fail. TODO: have a better way to handle failure here 1472 * fail. TODO: have a better way to handle failure here
@@ -1673,7 +1606,7 @@ out_unlock:
1673 * Common handling for a write to a "cpus" or "mems" file. 1606 * Common handling for a write to a "cpus" or "mems" file.
1674 */ 1607 */
1675static int cpuset_write_resmask(struct cgroup_subsys_state *css, 1608static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1676 struct cftype *cft, const char *buf) 1609 struct cftype *cft, char *buf)
1677{ 1610{
1678 struct cpuset *cs = css_cs(css); 1611 struct cpuset *cs = css_cs(css);
1679 struct cpuset *trialcs; 1612 struct cpuset *trialcs;
@@ -2020,8 +1953,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
2020 kfree(cs); 1953 kfree(cs);
2021} 1954}
2022 1955
2023struct cgroup_subsys cpuset_subsys = { 1956struct cgroup_subsys cpuset_cgrp_subsys = {
2024 .name = "cpuset",
2025 .css_alloc = cpuset_css_alloc, 1957 .css_alloc = cpuset_css_alloc,
2026 .css_online = cpuset_css_online, 1958 .css_online = cpuset_css_online,
2027 .css_offline = cpuset_css_offline, 1959 .css_offline = cpuset_css_offline,
@@ -2029,7 +1961,6 @@ struct cgroup_subsys cpuset_subsys = {
2029 .can_attach = cpuset_can_attach, 1961 .can_attach = cpuset_can_attach,
2030 .cancel_attach = cpuset_cancel_attach, 1962 .cancel_attach = cpuset_cancel_attach,
2031 .attach = cpuset_attach, 1963 .attach = cpuset_attach,
2032 .subsys_id = cpuset_subsys_id,
2033 .base_cftypes = files, 1964 .base_cftypes = files,
2034 .early_init = 1, 1965 .early_init = 1,
2035}; 1966};
@@ -2086,10 +2017,9 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2086 parent = parent_cs(parent); 2017 parent = parent_cs(parent);
2087 2018
2088 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 2019 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2089 rcu_read_lock(); 2020 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset ");
2090 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", 2021 pr_cont_cgroup_name(cs->css.cgroup);
2091 cgroup_name(cs->css.cgroup)); 2022 pr_cont("\n");
2092 rcu_read_unlock();
2093 } 2023 }
2094} 2024}
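Note: the printk in this hunk switches from cgroup_name() under RCU to the pr_cont_cgroup_name() helper, which appends the cgroup's name to a line that has already been started; the same shape reappears in the cpuset_print_task_mems_allowed() hunk below. A small hedged usage sketch with a hypothetical message.

#include <linux/cgroup.h>
#include <linux/printk.h>

/* Hedged sketch: example_report_cgroup() and its message are placeholders. */
static void example_report_cgroup(struct cgroup *cgrp)
{
        pr_err("cpuset: example condition hit in cgroup ");
        pr_cont_cgroup_name(cgrp);
        pr_cont("\n");
}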
2095 2025
@@ -2137,7 +2067,7 @@ retry:
2137 */ 2067 */
2138 if ((sane && cpumask_empty(cs->cpus_allowed)) || 2068 if ((sane && cpumask_empty(cs->cpus_allowed)) ||
2139 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) 2069 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
2140 update_tasks_cpumask(cs, NULL); 2070 update_tasks_cpumask(cs);
2141 2071
2142 mutex_lock(&callback_mutex); 2072 mutex_lock(&callback_mutex);
2143 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); 2073 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
@@ -2151,7 +2081,7 @@ retry:
2151 */ 2081 */
2152 if ((sane && nodes_empty(cs->mems_allowed)) || 2082 if ((sane && nodes_empty(cs->mems_allowed)) ||
2153 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) 2083 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
2154 update_tasks_nodemask(cs, NULL); 2084 update_tasks_nodemask(cs);
2155 2085
2156 is_empty = cpumask_empty(cs->cpus_allowed) || 2086 is_empty = cpumask_empty(cs->cpus_allowed) ||
2157 nodes_empty(cs->mems_allowed); 2087 nodes_empty(cs->mems_allowed);
@@ -2213,7 +2143,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2213 mutex_lock(&callback_mutex); 2143 mutex_lock(&callback_mutex);
2214 top_cpuset.mems_allowed = new_mems; 2144 top_cpuset.mems_allowed = new_mems;
2215 mutex_unlock(&callback_mutex); 2145 mutex_unlock(&callback_mutex);
2216 update_tasks_nodemask(&top_cpuset, NULL); 2146 update_tasks_nodemask(&top_cpuset);
2217 } 2147 }
2218 2148
2219 mutex_unlock(&cpuset_mutex); 2149 mutex_unlock(&cpuset_mutex);
@@ -2305,10 +2235,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2305 struct cpuset *cpus_cs; 2235 struct cpuset *cpus_cs;
2306 2236
2307 mutex_lock(&callback_mutex); 2237 mutex_lock(&callback_mutex);
2308 task_lock(tsk); 2238 rcu_read_lock();
2309 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2239 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
2310 guarantee_online_cpus(cpus_cs, pmask); 2240 guarantee_online_cpus(cpus_cs, pmask);
2311 task_unlock(tsk); 2241 rcu_read_unlock();
2312 mutex_unlock(&callback_mutex); 2242 mutex_unlock(&callback_mutex);
2313} 2243}
2314 2244
@@ -2361,10 +2291,10 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2361 nodemask_t mask; 2291 nodemask_t mask;
2362 2292
2363 mutex_lock(&callback_mutex); 2293 mutex_lock(&callback_mutex);
2364 task_lock(tsk); 2294 rcu_read_lock();
2365 mems_cs = effective_nodemask_cpuset(task_cs(tsk)); 2295 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
2366 guarantee_online_mems(mems_cs, &mask); 2296 guarantee_online_mems(mems_cs, &mask);
2367 task_unlock(tsk); 2297 rcu_read_unlock();
2368 mutex_unlock(&callback_mutex); 2298 mutex_unlock(&callback_mutex);
2369 2299
2370 return mask; 2300 return mask;
@@ -2480,10 +2410,10 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2480 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 2410 /* Not hardwall and node outside mems_allowed: scan up cpusets */
2481 mutex_lock(&callback_mutex); 2411 mutex_lock(&callback_mutex);
2482 2412
2483 task_lock(current); 2413 rcu_read_lock();
2484 cs = nearest_hardwall_ancestor(task_cs(current)); 2414 cs = nearest_hardwall_ancestor(task_cs(current));
2485 allowed = node_isset(node, cs->mems_allowed); 2415 allowed = node_isset(node, cs->mems_allowed);
2486 task_unlock(current); 2416 rcu_read_unlock();
2487 2417
2488 mutex_unlock(&callback_mutex); 2418 mutex_unlock(&callback_mutex);
2489 return allowed; 2419 return allowed;
@@ -2609,27 +2539,27 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2609 * @task: pointer to task_struct of some task. 2539 * @task: pointer to task_struct of some task.
2610 * 2540 *
2611 * Description: Prints @task's name, cpuset name, and cached copy of its 2541 * Description: Prints @task's name, cpuset name, and cached copy of its
2612 * mems_allowed to the kernel log. Must hold task_lock(task) to allow 2542 * mems_allowed to the kernel log.
2613 * dereferencing task_cs(task).
2614 */ 2543 */
2615void cpuset_print_task_mems_allowed(struct task_struct *tsk) 2544void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2616{ 2545{
2617 /* Statically allocated to prevent using excess stack. */ 2546 /* Statically allocated to prevent using excess stack. */
2618 static char cpuset_nodelist[CPUSET_NODELIST_LEN]; 2547 static char cpuset_nodelist[CPUSET_NODELIST_LEN];
2619 static DEFINE_SPINLOCK(cpuset_buffer_lock); 2548 static DEFINE_SPINLOCK(cpuset_buffer_lock);
2549 struct cgroup *cgrp;
2620 2550
2621 struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
2622
2623 rcu_read_lock();
2624 spin_lock(&cpuset_buffer_lock); 2551 spin_lock(&cpuset_buffer_lock);
2552 rcu_read_lock();
2625 2553
2554 cgrp = task_cs(tsk)->css.cgroup;
2626 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2555 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2627 tsk->mems_allowed); 2556 tsk->mems_allowed);
2628 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2557 printk(KERN_INFO "%s cpuset=", tsk->comm);
2629 tsk->comm, cgroup_name(cgrp), cpuset_nodelist); 2558 pr_cont_cgroup_name(cgrp);
2559 pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
2630 2560
2631 spin_unlock(&cpuset_buffer_lock);
2632 rcu_read_unlock(); 2561 rcu_read_unlock();
2562 spin_unlock(&cpuset_buffer_lock);
2633} 2563}
2634 2564
2635/* 2565/*
@@ -2660,9 +2590,9 @@ int cpuset_memory_pressure_enabled __read_mostly;
2660 2590
2661void __cpuset_memory_pressure_bump(void) 2591void __cpuset_memory_pressure_bump(void)
2662{ 2592{
2663 task_lock(current); 2593 rcu_read_lock();
2664 fmeter_markevent(&task_cs(current)->fmeter); 2594 fmeter_markevent(&task_cs(current)->fmeter);
2665 task_unlock(current); 2595 rcu_read_unlock();
2666} 2596}
2667 2597
2668#ifdef CONFIG_PROC_PID_CPUSET 2598#ifdef CONFIG_PROC_PID_CPUSET
@@ -2679,12 +2609,12 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2679{ 2609{
2680 struct pid *pid; 2610 struct pid *pid;
2681 struct task_struct *tsk; 2611 struct task_struct *tsk;
2682 char *buf; 2612 char *buf, *p;
2683 struct cgroup_subsys_state *css; 2613 struct cgroup_subsys_state *css;
2684 int retval; 2614 int retval;
2685 2615
2686 retval = -ENOMEM; 2616 retval = -ENOMEM;
2687 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 2617 buf = kmalloc(PATH_MAX, GFP_KERNEL);
2688 if (!buf) 2618 if (!buf)
2689 goto out; 2619 goto out;
2690 2620
@@ -2694,14 +2624,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2694 if (!tsk) 2624 if (!tsk)
2695 goto out_free; 2625 goto out_free;
2696 2626
2627 retval = -ENAMETOOLONG;
2697 rcu_read_lock(); 2628 rcu_read_lock();
2698 css = task_css(tsk, cpuset_subsys_id); 2629 css = task_css(tsk, cpuset_cgrp_id);
2699 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2630 p = cgroup_path(css->cgroup, buf, PATH_MAX);
2700 rcu_read_unlock(); 2631 rcu_read_unlock();
2701 if (retval < 0) 2632 if (!p)
2702 goto out_put_task; 2633 goto out_put_task;
2703 seq_puts(m, buf); 2634 seq_puts(m, p);
2704 seq_putc(m, '\n'); 2635 seq_putc(m, '\n');
2636 retval = 0;
2705out_put_task: 2637out_put_task:
2706 put_task_struct(tsk); 2638 put_task_struct(tsk);
2707out_free: 2639out_free:
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 334b3980ffc1..99982a70ddad 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1035,7 +1035,7 @@ int dbg_io_get_char(void)
1035 * otherwise as a quick means to stop program execution and "break" into 1035 * otherwise as a quick means to stop program execution and "break" into
1036 * the debugger. 1036 * the debugger.
1037 */ 1037 */
1038void kgdb_breakpoint(void) 1038noinline void kgdb_breakpoint(void)
1039{ 1039{
1040 atomic_inc(&kgdb_setting_breakpoint); 1040 atomic_inc(&kgdb_setting_breakpoint);
1041 wmb(); /* Sync point before breakpoint */ 1041 wmb(); /* Sync point before breakpoint */
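Note: the only change to debug_core.c is marking kgdb_breakpoint() noinline, so the compiler cannot fold the trap site into callers and the debugger keeps a single out-of-line address to break at. A trivial hedged sketch of a caller; the wrapper name is hypothetical.

#include <linux/kgdb.h>

/* Hedged sketch: with the noinline annotation above, this call always
 * reaches the one well-known trap site in debug_core.c. */
static void example_drop_into_debugger(void)
{
        kgdb_breakpoint();
}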
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fa0b2d4ad83c..f83a71a3e46d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -231,11 +231,29 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
231#define NR_ACCUMULATED_SAMPLES 128 231#define NR_ACCUMULATED_SAMPLES 128
232static DEFINE_PER_CPU(u64, running_sample_length); 232static DEFINE_PER_CPU(u64, running_sample_length);
233 233
234void perf_sample_event_took(u64 sample_len_ns) 234static void perf_duration_warn(struct irq_work *w)
235{ 235{
236 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
236 u64 avg_local_sample_len; 237 u64 avg_local_sample_len;
237 u64 local_samples_len; 238 u64 local_samples_len;
239
240 local_samples_len = __get_cpu_var(running_sample_length);
241 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
242
243 printk_ratelimited(KERN_WARNING
244 "perf interrupt took too long (%lld > %lld), lowering "
245 "kernel.perf_event_max_sample_rate to %d\n",
246 avg_local_sample_len, allowed_ns >> 1,
247 sysctl_perf_event_sample_rate);
248}
249
250static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
251
252void perf_sample_event_took(u64 sample_len_ns)
253{
238 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); 254 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
255 u64 avg_local_sample_len;
256 u64 local_samples_len;
239 257
240 if (allowed_ns == 0) 258 if (allowed_ns == 0)
241 return; 259 return;
@@ -263,13 +281,14 @@ void perf_sample_event_took(u64 sample_len_ns)
263 sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; 281 sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
264 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 282 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
265 283
266 printk_ratelimited(KERN_WARNING
267 "perf samples too long (%lld > %lld), lowering "
268 "kernel.perf_event_max_sample_rate to %d\n",
269 avg_local_sample_len, allowed_ns,
270 sysctl_perf_event_sample_rate);
271
272 update_perf_cpu_limits(); 284 update_perf_cpu_limits();
285
286 if (!irq_work_queue(&perf_duration_work)) {
287 early_printk("perf interrupt took too long (%lld > %lld), lowering "
288 "kernel.perf_event_max_sample_rate to %d\n",
289 avg_local_sample_len, allowed_ns >> 1,
290 sysctl_perf_event_sample_rate);
291 }
273} 292}
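Note: the restructuring above moves the rate-limited warning out of the sampling interrupt and into an irq_work item, so the printk runs in a safer context. A hedged, self-contained sketch of that deferral pattern using the same two primitives, DEFINE_IRQ_WORK() and irq_work_queue(); every name below is a placeholder.

#include <linux/irq_work.h>
#include <linux/printk.h>

static void example_deferred_warn(struct irq_work *w)
{
        pr_warn_ratelimited("example: slow path detected\n");
}

static DEFINE_IRQ_WORK(example_warn_work, example_deferred_warn);

static void example_hot_path(void)
{
        /*
         * Safe to call from hard interrupt context.  irq_work_queue()
         * returns false when the item was already pending; the hunk above
         * uses that return value to fall back to early_printk().
         */
        irq_work_queue(&example_warn_work);
}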
274 293
275static atomic64_t perf_event_id; 294static atomic64_t perf_event_id;
@@ -342,7 +361,7 @@ struct perf_cgroup {
342static inline struct perf_cgroup * 361static inline struct perf_cgroup *
343perf_cgroup_from_task(struct task_struct *task) 362perf_cgroup_from_task(struct task_struct *task)
344{ 363{
345 return container_of(task_css(task, perf_subsys_id), 364 return container_of(task_css(task, perf_event_cgrp_id),
346 struct perf_cgroup, css); 365 struct perf_cgroup, css);
347} 366}
348 367
@@ -370,11 +389,6 @@ perf_cgroup_match(struct perf_event *event)
370 event->cgrp->css.cgroup); 389 event->cgrp->css.cgroup);
371} 390}
372 391
373static inline bool perf_tryget_cgroup(struct perf_event *event)
374{
375 return css_tryget(&event->cgrp->css);
376}
377
378static inline void perf_put_cgroup(struct perf_event *event) 392static inline void perf_put_cgroup(struct perf_event *event)
379{ 393{
380 css_put(&event->cgrp->css); 394 css_put(&event->cgrp->css);
@@ -593,9 +607,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
593 if (!f.file) 607 if (!f.file)
594 return -EBADF; 608 return -EBADF;
595 609
596 rcu_read_lock(); 610 css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys);
597
598 css = css_from_dir(f.file->f_dentry, &perf_subsys);
599 if (IS_ERR(css)) { 611 if (IS_ERR(css)) {
600 ret = PTR_ERR(css); 612 ret = PTR_ERR(css);
601 goto out; 613 goto out;
@@ -604,13 +616,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
604 cgrp = container_of(css, struct perf_cgroup, css); 616 cgrp = container_of(css, struct perf_cgroup, css);
605 event->cgrp = cgrp; 617 event->cgrp = cgrp;
606 618
607 /* must be done before we fput() the file */
608 if (!perf_tryget_cgroup(event)) {
609 event->cgrp = NULL;
610 ret = -ENOENT;
611 goto out;
612 }
613
614 /* 619 /*
615 * all events in a group must monitor 620 * all events in a group must monitor
616 * the same cgroup because a task belongs 621 * the same cgroup because a task belongs
@@ -621,7 +626,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
621 ret = -EINVAL; 626 ret = -EINVAL;
622 } 627 }
623out: 628out:
624 rcu_read_unlock();
625 fdput(f); 629 fdput(f);
626 return ret; 630 return ret;
627} 631}
@@ -1714,7 +1718,7 @@ group_sched_in(struct perf_event *group_event,
1714 struct perf_event_context *ctx) 1718 struct perf_event_context *ctx)
1715{ 1719{
1716 struct perf_event *event, *partial_group = NULL; 1720 struct perf_event *event, *partial_group = NULL;
1717 struct pmu *pmu = group_event->pmu; 1721 struct pmu *pmu = ctx->pmu;
1718 u64 now = ctx->time; 1722 u64 now = ctx->time;
1719 bool simulate = false; 1723 bool simulate = false;
1720 1724
@@ -2563,8 +2567,6 @@ static void perf_branch_stack_sched_in(struct task_struct *prev,
2563 if (cpuctx->ctx.nr_branch_stack > 0 2567 if (cpuctx->ctx.nr_branch_stack > 0
2564 && pmu->flush_branch_stack) { 2568 && pmu->flush_branch_stack) {
2565 2569
2566 pmu = cpuctx->ctx.pmu;
2567
2568 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 2570 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2569 2571
2570 perf_pmu_disable(pmu); 2572 perf_pmu_disable(pmu);
@@ -6294,7 +6296,7 @@ static int perf_event_idx_default(struct perf_event *event)
6294 * Ensures all contexts with the same task_ctx_nr have the same 6296 * Ensures all contexts with the same task_ctx_nr have the same
6295 * pmu_cpu_context too. 6297 * pmu_cpu_context too.
6296 */ 6298 */
6297static void *find_pmu_context(int ctxn) 6299static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
6298{ 6300{
6299 struct pmu *pmu; 6301 struct pmu *pmu;
6300 6302
@@ -8036,7 +8038,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
8036{ 8038{
8037 struct task_struct *task; 8039 struct task_struct *task;
8038 8040
8039 cgroup_taskset_for_each(task, css, tset) 8041 cgroup_taskset_for_each(task, tset)
8040 task_function_call(task, __perf_cgroup_move, task); 8042 task_function_call(task, __perf_cgroup_move, task);
8041} 8043}
8042 8044
@@ -8055,9 +8057,7 @@ static void perf_cgroup_exit(struct cgroup_subsys_state *css,
8055 task_function_call(task, __perf_cgroup_move, task); 8057 task_function_call(task, __perf_cgroup_move, task);
8056} 8058}
8057 8059
8058struct cgroup_subsys perf_subsys = { 8060struct cgroup_subsys perf_event_cgrp_subsys = {
8059 .name = "perf_event",
8060 .subsys_id = perf_subsys_id,
8061 .css_alloc = perf_cgroup_css_alloc, 8061 .css_alloc = perf_cgroup_css_alloc,
8062 .css_free = perf_cgroup_css_free, 8062 .css_free = perf_cgroup_css_free,
8063 .exit = perf_cgroup_exit, 8063 .exit = perf_cgroup_exit,
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 307d87c0991a..04709b66369d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1804,6 +1804,11 @@ static bool handle_trampoline(struct pt_regs *regs)
1804 return true; 1804 return true;
1805} 1805}
1806 1806
1807bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
1808{
1809 return false;
1810}
1811
1807/* 1812/*
1808 * Run handler and ask thread to singlestep. 1813 * Run handler and ask thread to singlestep.
1809 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1814 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1858,7 +1863,11 @@ static void handle_swbp(struct pt_regs *regs)
1858 if (!get_utask()) 1863 if (!get_utask())
1859 goto out; 1864 goto out;
1860 1865
1866 if (arch_uprobe_ignore(&uprobe->arch, regs))
1867 goto out;
1868
1861 handler_chain(uprobe, regs); 1869 handler_chain(uprobe, regs);
1870
1862 if (can_skip_sstep(uprobe, regs)) 1871 if (can_skip_sstep(uprobe, regs))
1863 goto out; 1872 goto out;
1864 1873
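Note: the new __weak arch_uprobe_ignore() hook lets an architecture veto a probe hit before the handler chain runs, for example when the probed instruction is conditional and its condition is false. A hedged sketch of an arch-side override; the predicate it calls is a stub standing in for real instruction decoding.

#include <linux/types.h>
#include <linux/uprobes.h>
#include <linux/ptrace.h>

/* Hedged sketch of an architecture override for the __weak hook above. */
static bool example_arch_condition_failed(struct arch_uprobe *auprobe,
                                          struct pt_regs *regs)
{
        return false;   /* placeholder: a real port would decode the insn */
}

bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        /* returning true makes handle_swbp() skip handlers and single-step */
        return example_arch_condition_failed(auprobe, regs);
}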
diff --git a/kernel/exit.c b/kernel/exit.c
index 1e77fc645317..6480d1c85d7a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -797,7 +797,7 @@ void do_exit(long code)
797 */ 797 */
798 perf_event_exit_task(tsk); 798 perf_event_exit_task(tsk);
799 799
800 cgroup_exit(tsk, 1); 800 cgroup_exit(tsk);
801 801
802 if (group_dead) 802 if (group_dead)
803 disassociate_ctty(1); 803 disassociate_ctty(1);
diff --git a/kernel/extable.c b/kernel/extable.c
index 763faf037ec1..d8a6446adbcb 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -36,7 +36,7 @@ extern struct exception_table_entry __start___ex_table[];
36extern struct exception_table_entry __stop___ex_table[]; 36extern struct exception_table_entry __stop___ex_table[];
37 37
38/* Cleared by build time tools if the table is already sorted. */ 38/* Cleared by build time tools if the table is already sorted. */
39u32 __initdata main_extable_sort_needed = 1; 39u32 __initdata __visible main_extable_sort_needed = 1;
40 40
41/* Sort the kernel's built-in exception table */ 41/* Sort the kernel's built-in exception table */
42void __init sort_main_extable(void) 42void __init sort_main_extable(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index a17621c6cd42..abc45890f0a5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -237,6 +237,7 @@ void __put_task_struct(struct task_struct *tsk)
237 WARN_ON(atomic_read(&tsk->usage)); 237 WARN_ON(atomic_read(&tsk->usage));
238 WARN_ON(tsk == current); 238 WARN_ON(tsk == current);
239 239
240 task_numa_free(tsk);
240 security_task_free(tsk); 241 security_task_free(tsk);
241 exit_creds(tsk); 242 exit_creds(tsk);
242 delayacct_tsk_free(tsk); 243 delayacct_tsk_free(tsk);
@@ -1271,7 +1272,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1271 if (IS_ERR(p->mempolicy)) { 1272 if (IS_ERR(p->mempolicy)) {
1272 retval = PTR_ERR(p->mempolicy); 1273 retval = PTR_ERR(p->mempolicy);
1273 p->mempolicy = NULL; 1274 p->mempolicy = NULL;
1274 goto bad_fork_cleanup_cgroup; 1275 goto bad_fork_cleanup_threadgroup_lock;
1275 } 1276 }
1276 mpol_fix_fork_child_flag(p); 1277 mpol_fix_fork_child_flag(p);
1277#endif 1278#endif
@@ -1524,11 +1525,10 @@ bad_fork_cleanup_policy:
1524 perf_event_free_task(p); 1525 perf_event_free_task(p);
1525#ifdef CONFIG_NUMA 1526#ifdef CONFIG_NUMA
1526 mpol_put(p->mempolicy); 1527 mpol_put(p->mempolicy);
1527bad_fork_cleanup_cgroup: 1528bad_fork_cleanup_threadgroup_lock:
1528#endif 1529#endif
1529 if (clone_flags & CLONE_THREAD) 1530 if (clone_flags & CLONE_THREAD)
1530 threadgroup_change_end(current); 1531 threadgroup_change_end(current);
1531 cgroup_exit(p, 0);
1532 delayacct_tsk_free(p); 1532 delayacct_tsk_free(p);
1533 module_put(task_thread_info(p)->exec_domain->module); 1533 module_put(task_thread_info(p)->exec_domain->module);
1534bad_fork_cleanup_count: 1534bad_fork_cleanup_count:
diff --git a/kernel/futex.c b/kernel/futex.c
index 44a1261cb9ff..67dacaf93e56 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -157,7 +157,9 @@
157 * enqueue. 157 * enqueue.
158 */ 158 */
159 159
160#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
160int __read_mostly futex_cmpxchg_enabled; 161int __read_mostly futex_cmpxchg_enabled;
162#endif
161 163
162/* 164/*
163 * Futex flags used to encode options to functions and preserve them across 165 * Futex flags used to encode options to functions and preserve them across
@@ -234,6 +236,7 @@ static const struct futex_q futex_q_init = {
234 * waiting on a futex. 236 * waiting on a futex.
235 */ 237 */
236struct futex_hash_bucket { 238struct futex_hash_bucket {
239 atomic_t waiters;
237 spinlock_t lock; 240 spinlock_t lock;
238 struct plist_head chain; 241 struct plist_head chain;
239} ____cacheline_aligned_in_smp; 242} ____cacheline_aligned_in_smp;
@@ -253,22 +256,37 @@ static inline void futex_get_mm(union futex_key *key)
253 smp_mb__after_atomic_inc(); 256 smp_mb__after_atomic_inc();
254} 257}
255 258
256static inline bool hb_waiters_pending(struct futex_hash_bucket *hb) 259/*
260 * Reflects a new waiter being added to the waitqueue.
261 */
262static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
257{ 263{
258#ifdef CONFIG_SMP 264#ifdef CONFIG_SMP
265 atomic_inc(&hb->waiters);
259 /* 266 /*
260 * Tasks trying to enter the critical region are most likely 267 * Full barrier (A), see the ordering comment above.
261 * potential waiters that will be added to the plist. Ensure
262 * that wakers won't miss to-be-slept tasks in the window between
263 * the wait call and the actual plist_add.
264 */ 268 */
265 if (spin_is_locked(&hb->lock)) 269 smp_mb__after_atomic_inc();
266 return true; 270#endif
267 smp_rmb(); /* Make sure we check the lock state first */ 271}
272
273/*
274 * Reflects a waiter being removed from the waitqueue by wakeup
275 * paths.
276 */
277static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
278{
279#ifdef CONFIG_SMP
280 atomic_dec(&hb->waiters);
281#endif
282}
268 283
269 return !plist_head_empty(&hb->chain); 284static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
285{
286#ifdef CONFIG_SMP
287 return atomic_read(&hb->waiters);
270#else 288#else
271 return true; 289 return 1;
272#endif 290#endif
273} 291}
274 292
@@ -954,6 +972,7 @@ static void __unqueue_futex(struct futex_q *q)
954 972
955 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); 973 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
956 plist_del(&q->list, &hb->chain); 974 plist_del(&q->list, &hb->chain);
975 hb_waiters_dec(hb);
957} 976}
958 977
959/* 978/*
@@ -1257,7 +1276,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1257 */ 1276 */
1258 if (likely(&hb1->chain != &hb2->chain)) { 1277 if (likely(&hb1->chain != &hb2->chain)) {
1259 plist_del(&q->list, &hb1->chain); 1278 plist_del(&q->list, &hb1->chain);
1279 hb_waiters_dec(hb1);
1260 plist_add(&q->list, &hb2->chain); 1280 plist_add(&q->list, &hb2->chain);
1281 hb_waiters_inc(hb2);
1261 q->lock_ptr = &hb2->lock; 1282 q->lock_ptr = &hb2->lock;
1262 } 1283 }
1263 get_futex_key_refs(key2); 1284 get_futex_key_refs(key2);
@@ -1600,6 +1621,17 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1600 struct futex_hash_bucket *hb; 1621 struct futex_hash_bucket *hb;
1601 1622
1602 hb = hash_futex(&q->key); 1623 hb = hash_futex(&q->key);
1624
1625 /*
1626 * Increment the counter before taking the lock so that
1627 * a potential waker won't miss a to-be-slept task that is
1628 * waiting for the spinlock. This is safe as all queue_lock()
1629 * users end up calling queue_me(). Similarly, for housekeeping,
1630 * decrement the counter at queue_unlock() when some error has
1631 * occurred and we don't end up adding the task to the list.
1632 */
1633 hb_waiters_inc(hb);
1634
1603 q->lock_ptr = &hb->lock; 1635 q->lock_ptr = &hb->lock;
1604 1636
1605 spin_lock(&hb->lock); /* implies MB (A) */ 1637 spin_lock(&hb->lock); /* implies MB (A) */
@@ -1611,6 +1643,7 @@ queue_unlock(struct futex_hash_bucket *hb)
1611 __releases(&hb->lock) 1643 __releases(&hb->lock)
1612{ 1644{
1613 spin_unlock(&hb->lock); 1645 spin_unlock(&hb->lock);
1646 hb_waiters_dec(hb);
1614} 1647}
1615 1648
1616/** 1649/**
@@ -2342,6 +2375,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2342 * Unqueue the futex_q and determine which it was. 2375 * Unqueue the futex_q and determine which it was.
2343 */ 2376 */
2344 plist_del(&q->list, &hb->chain); 2377 plist_del(&q->list, &hb->chain);
2378 hb_waiters_dec(hb);
2345 2379
2346 /* Handle spurious wakeups gracefully */ 2380 /* Handle spurious wakeups gracefully */
2347 ret = -EWOULDBLOCK; 2381 ret = -EWOULDBLOCK;
@@ -2843,9 +2877,28 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
2843 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 2877 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
2844} 2878}
2845 2879
2846static int __init futex_init(void) 2880static void __init futex_detect_cmpxchg(void)
2847{ 2881{
2882#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
2848 u32 curval; 2883 u32 curval;
2884
2885 /*
2886 * This will fail and we want it. Some arch implementations do
2887 * runtime detection of the futex_atomic_cmpxchg_inatomic()
2888 * functionality. We want to know that before we call in any
2889 * of the complex code paths. Also we want to prevent
2890 * registration of robust lists in that case. NULL is
2891 * guaranteed to fault and we get -EFAULT on functional
2892 * implementation, the non-functional ones will return
2893 * -ENOSYS.
2894 */
2895 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2896 futex_cmpxchg_enabled = 1;
2897#endif
2898}
2899
2900static int __init futex_init(void)
2901{
2849 unsigned int futex_shift; 2902 unsigned int futex_shift;
2850 unsigned long i; 2903 unsigned long i;
2851 2904
@@ -2861,20 +2914,11 @@ static int __init futex_init(void)
2861 &futex_shift, NULL, 2914 &futex_shift, NULL,
2862 futex_hashsize, futex_hashsize); 2915 futex_hashsize, futex_hashsize);
2863 futex_hashsize = 1UL << futex_shift; 2916 futex_hashsize = 1UL << futex_shift;
2864 /* 2917
2865 * This will fail and we want it. Some arch implementations do 2918 futex_detect_cmpxchg();
2866 * runtime detection of the futex_atomic_cmpxchg_inatomic()
2867 * functionality. We want to know that before we call in any
2868 * of the complex code paths. Also we want to prevent
2869 * registration of robust lists in that case. NULL is
2870 * guaranteed to fault and we get -EFAULT on functional
2871 * implementation, the non-functional ones will return
2872 * -ENOSYS.
2873 */
2874 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2875 futex_cmpxchg_enabled = 1;
2876 2919
2877 for (i = 0; i < futex_hashsize; i++) { 2920 for (i = 0; i < futex_hashsize; i++) {
2921 atomic_set(&futex_queues[i].waiters, 0);
2878 plist_head_init(&futex_queues[i].chain); 2922 plist_head_init(&futex_queues[i].chain);
2879 spin_lock_init(&futex_queues[i].lock); 2923 spin_lock_init(&futex_queues[i].lock);
2880 } 2924 }
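
The futex.c hunks above replace the spin_is_locked() heuristic in hb_waiters_pending() with an explicit per-bucket atomic waiter count, incremented in queue_lock() before the bucket spinlock is taken so a waker that reads zero can skip locking the bucket entirely. A hedged, illustrative sketch of the same pattern in user-space C11/pthreads (bucket_wait(), bucket_wake() and the signalled flag are made-up names, not futex API):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct bucket {
	atomic_int waiters;		/* plays the role of futex_hash_bucket::waiters */
	pthread_mutex_t lock;
	pthread_cond_t cond;
	bool signalled;
};

static struct bucket b = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.cond = PTHREAD_COND_INITIALIZER,
};

static void bucket_wait(void)
{
	atomic_fetch_add(&b.waiters, 1);	/* advertise ourselves before blocking */
	pthread_mutex_lock(&b.lock);
	while (!b.signalled)
		pthread_cond_wait(&b.cond, &b.lock);
	pthread_mutex_unlock(&b.lock);
	atomic_fetch_sub(&b.waiters, 1);
}

static void bucket_wake(void)
{
	if (atomic_load(&b.waiters) == 0)
		return;				/* nobody can be queued: skip the lock */
	pthread_mutex_lock(&b.lock);
	b.signalled = true;
	pthread_cond_broadcast(&b.cond);
	pthread_mutex_unlock(&b.lock);
}

The ordering mirrors the comment added in queue_lock(): the increment has to be visible before the waiter can possibly be on the list, otherwise the cheap early return in the wake path could miss a to-be-slept task.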
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index f9f44fd4d34d..55c8c9349cfe 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -183,7 +183,7 @@ COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
183 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 183 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
184 cmd == FUTEX_WAIT_BITSET || 184 cmd == FUTEX_WAIT_BITSET ||
185 cmd == FUTEX_WAIT_REQUEUE_PI)) { 185 cmd == FUTEX_WAIT_REQUEUE_PI)) {
186 if (get_compat_timespec(&ts, utime)) 186 if (compat_get_timespec(&ts, utime))
187 return -EFAULT; 187 return -EFAULT;
188 if (!timespec_valid(&ts)) 188 if (!timespec_valid(&ts))
189 return -EINVAL; 189 return -EINVAL;
diff --git a/kernel/groups.c b/kernel/groups.c
index 90cf1c38c8ea..451698f86cfa 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -157,17 +157,13 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
157 * set_groups - Change a group subscription in a set of credentials 157 * set_groups - Change a group subscription in a set of credentials
158 * @new: The newly prepared set of credentials to alter 158 * @new: The newly prepared set of credentials to alter
159 * @group_info: The group list to install 159 * @group_info: The group list to install
160 *
161 * Validate a group subscription and, if valid, insert it into a set
162 * of credentials.
163 */ 160 */
164int set_groups(struct cred *new, struct group_info *group_info) 161void set_groups(struct cred *new, struct group_info *group_info)
165{ 162{
166 put_group_info(new->group_info); 163 put_group_info(new->group_info);
167 groups_sort(group_info); 164 groups_sort(group_info);
168 get_group_info(group_info); 165 get_group_info(group_info);
169 new->group_info = group_info; 166 new->group_info = group_info;
170 return 0;
171} 167}
172 168
173EXPORT_SYMBOL(set_groups); 169EXPORT_SYMBOL(set_groups);
@@ -182,18 +178,12 @@ EXPORT_SYMBOL(set_groups);
182int set_current_groups(struct group_info *group_info) 178int set_current_groups(struct group_info *group_info)
183{ 179{
184 struct cred *new; 180 struct cred *new;
185 int ret;
186 181
187 new = prepare_creds(); 182 new = prepare_creds();
188 if (!new) 183 if (!new)
189 return -ENOMEM; 184 return -ENOMEM;
190 185
191 ret = set_groups(new, group_info); 186 set_groups(new, group_info);
192 if (ret < 0) {
193 abort_creds(new);
194 return ret;
195 }
196
197 return commit_creds(new); 187 return commit_creds(new);
198} 188}
199 189
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 09094361dce5..d55092ceee29 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -168,19 +168,6 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
168 } 168 }
169} 169}
170 170
171
172/*
173 * Get the preferred target CPU for NOHZ
174 */
175static int hrtimer_get_target(int this_cpu, int pinned)
176{
177#ifdef CONFIG_NO_HZ_COMMON
178 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
179 return get_nohz_timer_target();
180#endif
181 return this_cpu;
182}
183
184/* 171/*
185 * With HIGHRES=y we do not migrate the timer when it is expiring 172 * With HIGHRES=y we do not migrate the timer when it is expiring
186 * before the next event on the target cpu because we cannot reprogram 173 * before the next event on the target cpu because we cannot reprogram
@@ -214,7 +201,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
214 struct hrtimer_clock_base *new_base; 201 struct hrtimer_clock_base *new_base;
215 struct hrtimer_cpu_base *new_cpu_base; 202 struct hrtimer_cpu_base *new_cpu_base;
216 int this_cpu = smp_processor_id(); 203 int this_cpu = smp_processor_id();
217 int cpu = hrtimer_get_target(this_cpu, pinned); 204 int cpu = get_nohz_timer_target(pinned);
218 int basenum = base->index; 205 int basenum = base->index;
219 206
220again: 207again:
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0b9c169d577f..06bb1417b063 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -246,5 +246,4 @@ static int __init hung_task_init(void)
246 246
247 return 0; 247 return 0;
248} 248}
249 249subsys_initcall(hung_task_init);
250module_init(hung_task_init);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index dc04c166c54d..6397df2d6945 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -281,6 +281,19 @@ void unmask_irq(struct irq_desc *desc)
281 } 281 }
282} 282}
283 283
284void unmask_threaded_irq(struct irq_desc *desc)
285{
286 struct irq_chip *chip = desc->irq_data.chip;
287
288 if (chip->flags & IRQCHIP_EOI_THREADED)
289 chip->irq_eoi(&desc->irq_data);
290
291 if (chip->irq_unmask) {
292 chip->irq_unmask(&desc->irq_data);
293 irq_state_clr_masked(desc);
294 }
295}
296
284/* 297/*
285 * handle_nested_irq - Handle a nested irq from a irq thread 298 * handle_nested_irq - Handle a nested irq from a irq thread
286 * @irq: the interrupt number 299 * @irq: the interrupt number
@@ -435,6 +448,27 @@ static inline void preflow_handler(struct irq_desc *desc)
435static inline void preflow_handler(struct irq_desc *desc) { } 448static inline void preflow_handler(struct irq_desc *desc) { }
436#endif 449#endif
437 450
451static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
452{
453 if (!(desc->istate & IRQS_ONESHOT)) {
454 chip->irq_eoi(&desc->irq_data);
455 return;
456 }
457 /*
458 * We need to unmask in the following cases:
459 * - Oneshot irq which did not wake the thread (caused by a
460 * spurious interrupt or a primary handler handling it
461 * completely).
462 */
463 if (!irqd_irq_disabled(&desc->irq_data) &&
464 irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) {
465 chip->irq_eoi(&desc->irq_data);
466 unmask_irq(desc);
467 } else if (!(chip->flags & IRQCHIP_EOI_THREADED)) {
468 chip->irq_eoi(&desc->irq_data);
469 }
470}
471
438/** 472/**
439 * handle_fasteoi_irq - irq handler for transparent controllers 473 * handle_fasteoi_irq - irq handler for transparent controllers
440 * @irq: the interrupt number 474 * @irq: the interrupt number
@@ -448,6 +482,8 @@ static inline void preflow_handler(struct irq_desc *desc) { }
448void 482void
449handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) 483handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
450{ 484{
485 struct irq_chip *chip = desc->irq_data.chip;
486
451 raw_spin_lock(&desc->lock); 487 raw_spin_lock(&desc->lock);
452 488
453 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) 489 if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
@@ -473,18 +509,14 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
473 preflow_handler(desc); 509 preflow_handler(desc);
474 handle_irq_event(desc); 510 handle_irq_event(desc);
475 511
476 if (desc->istate & IRQS_ONESHOT) 512 cond_unmask_eoi_irq(desc, chip);
477 cond_unmask_irq(desc);
478 513
479out_eoi:
480 desc->irq_data.chip->irq_eoi(&desc->irq_data);
481out_unlock:
482 raw_spin_unlock(&desc->lock); 514 raw_spin_unlock(&desc->lock);
483 return; 515 return;
484out: 516out:
485 if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) 517 if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
486 goto out_eoi; 518 chip->irq_eoi(&desc->irq_data);
487 goto out_unlock; 519 raw_spin_unlock(&desc->lock);
488} 520}
489 521
490/** 522/**
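
With cond_unmask_eoi_irq(), handle_fasteoi_irq() can defer the final EOI of a oneshot interrupt until the threaded handler has run; a chip opts in by setting IRQCHIP_EOI_THREADED, and unmask_threaded_irq() then issues irq_eoi() before unmasking. A hedged sketch of such a chip (the foo_* callbacks are placeholders for real register accesses, not from this patch):

#include <linux/irq.h>

static void foo_irq_mask(struct irq_data *d)	{ /* mask the line in hardware */ }
static void foo_irq_unmask(struct irq_data *d)	{ /* unmask the line in hardware */ }
static void foo_irq_eoi(struct irq_data *d)	{ /* write the controller's EOI register */ }

static struct irq_chip foo_irq_chip = {
	.name		= "foo",
	.irq_mask	= foo_irq_mask,
	.irq_unmask	= foo_irq_unmask,
	.irq_eoi	= foo_irq_eoi,
	/* Delay the EOI of a oneshot irq until the irq thread has finished. */
	.flags		= IRQCHIP_EOI_THREADED,
};

static void foo_map_irq(unsigned int irq)
{
	irq_set_chip_and_handler(irq, &foo_irq_chip, handle_fasteoi_irq);
}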
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 131ca176b497..635480270858 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -41,6 +41,7 @@ irqreturn_t no_action(int cpl, void *dev_id)
41{ 41{
42 return IRQ_NONE; 42 return IRQ_NONE;
43} 43}
44EXPORT_SYMBOL_GPL(no_action);
44 45
45static void warn_no_thread(unsigned int irq, struct irqaction *action) 46static void warn_no_thread(unsigned int irq, struct irqaction *action)
46{ 47{
@@ -51,7 +52,7 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
51 "but no thread function available.", irq, action->name); 52 "but no thread function available.", irq, action->name);
52} 53}
53 54
54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) 55void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
55{ 56{
56 /* 57 /*
57 * In case the thread crashed and was killed we just pretend that 58 * In case the thread crashed and was killed we just pretend that
@@ -157,7 +158,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
157 break; 158 break;
158 } 159 }
159 160
160 irq_wake_thread(desc, action); 161 __irq_wake_thread(desc, action);
161 162
162 /* Fall through to add to randomness */ 163 /* Fall through to add to randomness */
163 case IRQ_HANDLED: 164 case IRQ_HANDLED:
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 001fa5bab490..ddf1ffeb79f1 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -6,6 +6,7 @@
6 * of this file for your non core code. 6 * of this file for your non core code.
7 */ 7 */
8#include <linux/irqdesc.h> 8#include <linux/irqdesc.h>
9#include <linux/kernel_stat.h>
9 10
10#ifdef CONFIG_SPARSE_IRQ 11#ifdef CONFIG_SPARSE_IRQ
11# define IRQ_BITMAP_BITS (NR_IRQS + 8196) 12# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
@@ -73,6 +74,7 @@ extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
73extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); 74extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
74extern void mask_irq(struct irq_desc *desc); 75extern void mask_irq(struct irq_desc *desc);
75extern void unmask_irq(struct irq_desc *desc); 76extern void unmask_irq(struct irq_desc *desc);
77extern void unmask_threaded_irq(struct irq_desc *desc);
76 78
77extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 79extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
78 80
@@ -82,6 +84,7 @@ irqreturn_t handle_irq_event(struct irq_desc *desc);
82/* Resending of interrupts :*/ 84/* Resending of interrupts :*/
83void check_irq_resend(struct irq_desc *desc, unsigned int irq); 85void check_irq_resend(struct irq_desc *desc, unsigned int irq);
84bool irq_wait_for_poll(struct irq_desc *desc); 86bool irq_wait_for_poll(struct irq_desc *desc);
87void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
85 88
86#ifdef CONFIG_PROC_FS 89#ifdef CONFIG_PROC_FS
87extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 90extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -179,3 +182,9 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
179{ 182{
180 return d->state_use_accessors & mask; 183 return d->state_use_accessors & mask;
181} 184}
185
186static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc)
187{
188 __this_cpu_inc(*desc->kstat_irqs);
189 __this_cpu_inc(kstat.irqs_sum);
190}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 8ab8e9390297..a7174617616b 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -489,6 +489,11 @@ void dynamic_irq_cleanup(unsigned int irq)
489 raw_spin_unlock_irqrestore(&desc->lock, flags); 489 raw_spin_unlock_irqrestore(&desc->lock, flags);
490} 490}
491 491
492void kstat_incr_irq_this_cpu(unsigned int irq)
493{
494 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
495}
496
492unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 497unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
493{ 498{
494 struct irq_desc *desc = irq_to_desc(irq); 499 struct irq_desc *desc = irq_to_desc(irq);
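
kstat_incr_irq_this_cpu() gives code that only has an interrupt number a way to bump the per-CPU interrupt statistics, now that kstat_incr_irqs_this_cpu() lives in kernel/irq/internals.h and requires the irq_desc. A hedged sketch of the kind of self-accounted architecture interrupt that would call it (FOO_TIMER_IRQ and the handler are illustrative assumptions):

#include <linux/kernel_stat.h>
#include <linux/hardirq.h>

#define FOO_TIMER_IRQ	16	/* hypothetical fixed vector number */

void foo_timer_interrupt(void)
{
	irq_enter();
	/* Account the tick without going through the generic flow handlers. */
	kstat_incr_irq_this_cpu(FOO_TIMER_IRQ);
	/* ... run the tick handling ... */
	irq_exit();
}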
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d3bf660cb57f..2486a4c1a710 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -32,24 +32,10 @@ static int __init setup_forced_irqthreads(char *arg)
32early_param("threadirqs", setup_forced_irqthreads); 32early_param("threadirqs", setup_forced_irqthreads);
33#endif 33#endif
34 34
35/** 35static void __synchronize_hardirq(struct irq_desc *desc)
36 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
37 * @irq: interrupt number to wait for
38 *
39 * This function waits for any pending IRQ handlers for this interrupt
40 * to complete before returning. If you use this function while
41 * holding a resource the IRQ handler may need you will deadlock.
42 *
43 * This function may be called - with care - from IRQ context.
44 */
45void synchronize_irq(unsigned int irq)
46{ 36{
47 struct irq_desc *desc = irq_to_desc(irq);
48 bool inprogress; 37 bool inprogress;
49 38
50 if (!desc)
51 return;
52
53 do { 39 do {
54 unsigned long flags; 40 unsigned long flags;
55 41
@@ -67,12 +53,56 @@ void synchronize_irq(unsigned int irq)
67 53
68 /* Oops, that failed? */ 54 /* Oops, that failed? */
69 } while (inprogress); 55 } while (inprogress);
56}
70 57
71 /* 58/**
72 * We made sure that no hardirq handler is running. Now verify 59 * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs)
73 * that no threaded handlers are active. 60 * @irq: interrupt number to wait for
74 */ 61 *
75 wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); 62 * This function waits for any pending hard IRQ handlers for this
63 * interrupt to complete before returning. If you use this
64 * function while holding a resource the IRQ handler may need you
65 * will deadlock. It does not take associated threaded handlers
66 * into account.
67 *
68 * Do not use this for shutdown scenarios where you must be sure
69 * that all parts (hardirq and threaded handler) have completed.
70 *
71 * This function may be called - with care - from IRQ context.
72 */
73void synchronize_hardirq(unsigned int irq)
74{
75 struct irq_desc *desc = irq_to_desc(irq);
76
77 if (desc)
78 __synchronize_hardirq(desc);
79}
80EXPORT_SYMBOL(synchronize_hardirq);
81
82/**
83 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
84 * @irq: interrupt number to wait for
85 *
86 * This function waits for any pending IRQ handlers for this interrupt
87 * to complete before returning. If you use this function while
88 * holding a resource the IRQ handler may need you will deadlock.
89 *
90 * This function may be called - with care - from IRQ context.
91 */
92void synchronize_irq(unsigned int irq)
93{
94 struct irq_desc *desc = irq_to_desc(irq);
95
96 if (desc) {
97 __synchronize_hardirq(desc);
98 /*
99 * We made sure that no hardirq handler is
100 * running. Now verify that no threaded handlers are
101 * active.
102 */
103 wait_event(desc->wait_for_threads,
104 !atomic_read(&desc->threads_active));
105 }
76} 106}
77EXPORT_SYMBOL(synchronize_irq); 107EXPORT_SYMBOL(synchronize_irq);
78 108
@@ -718,7 +748,7 @@ again:
718 748
719 if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && 749 if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) &&
720 irqd_irq_masked(&desc->irq_data)) 750 irqd_irq_masked(&desc->irq_data))
721 unmask_irq(desc); 751 unmask_threaded_irq(desc);
722 752
723out_unlock: 753out_unlock:
724 raw_spin_unlock_irq(&desc->lock); 754 raw_spin_unlock_irq(&desc->lock);
@@ -727,7 +757,7 @@ out_unlock:
727 757
728#ifdef CONFIG_SMP 758#ifdef CONFIG_SMP
729/* 759/*
730 * Check whether we need to chasnge the affinity of the interrupt thread. 760 * Check whether we need to change the affinity of the interrupt thread.
731 */ 761 */
732static void 762static void
733irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 763irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
@@ -880,6 +910,33 @@ static int irq_thread(void *data)
880 return 0; 910 return 0;
881} 911}
882 912
913/**
914 * irq_wake_thread - wake the irq thread for the action identified by dev_id
915 * @irq: Interrupt line
916 * @dev_id: Device identity for which the thread should be woken
917 *
918 */
919void irq_wake_thread(unsigned int irq, void *dev_id)
920{
921 struct irq_desc *desc = irq_to_desc(irq);
922 struct irqaction *action;
923 unsigned long flags;
924
925 if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
926 return;
927
928 raw_spin_lock_irqsave(&desc->lock, flags);
929 for (action = desc->action; action; action = action->next) {
930 if (action->dev_id == dev_id) {
931 if (action->thread)
932 __irq_wake_thread(desc, action);
933 break;
934 }
935 }
936 raw_spin_unlock_irqrestore(&desc->lock, flags);
937}
938EXPORT_SYMBOL_GPL(irq_wake_thread);
939
883static void irq_setup_forced_threading(struct irqaction *new) 940static void irq_setup_forced_threading(struct irqaction *new)
884{ 941{
885 if (!force_irqthreads) 942 if (!force_irqthreads)
@@ -896,6 +953,23 @@ static void irq_setup_forced_threading(struct irqaction *new)
896 } 953 }
897} 954}
898 955
956static int irq_request_resources(struct irq_desc *desc)
957{
958 struct irq_data *d = &desc->irq_data;
959 struct irq_chip *c = d->chip;
960
961 return c->irq_request_resources ? c->irq_request_resources(d) : 0;
962}
963
964static void irq_release_resources(struct irq_desc *desc)
965{
966 struct irq_data *d = &desc->irq_data;
967 struct irq_chip *c = d->chip;
968
969 if (c->irq_release_resources)
970 c->irq_release_resources(d);
971}
972
899/* 973/*
900 * Internal function to register an irqaction - typically used to 974 * Internal function to register an irqaction - typically used to
901 * allocate special interrupts that are part of the architecture. 975 * allocate special interrupts that are part of the architecture.
@@ -1091,6 +1165,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1091 } 1165 }
1092 1166
1093 if (!shared) { 1167 if (!shared) {
1168 ret = irq_request_resources(desc);
1169 if (ret) {
1170 pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
1171 new->name, irq, desc->irq_data.chip->name);
1172 goto out_mask;
1173 }
1174
1094 init_waitqueue_head(&desc->wait_for_threads); 1175 init_waitqueue_head(&desc->wait_for_threads);
1095 1176
1096 /* Setup the type (level, edge polarity) if configured: */ 1177 /* Setup the type (level, edge polarity) if configured: */
@@ -1261,8 +1342,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1261 *action_ptr = action->next; 1342 *action_ptr = action->next;
1262 1343
1263 /* If this was the last handler, shut down the IRQ line: */ 1344 /* If this was the last handler, shut down the IRQ line: */
1264 if (!desc->action) 1345 if (!desc->action) {
1265 irq_shutdown(desc); 1346 irq_shutdown(desc);
1347 irq_release_resources(desc);
1348 }
1266 1349
1267#ifdef CONFIG_SMP 1350#ifdef CONFIG_SMP
1268 /* make sure affinity_hint is cleaned up */ 1351 /* make sure affinity_hint is cleaned up */
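
__setup_irq() now calls the chip's irq_request_resources() callback before the first handler on a line is installed, and __free_irq() releases the resources again after irq_shutdown(). A hedged sketch of a chip using these hooks to keep a clock running only while the line is in use (struct foo_irq_data and the clock are assumptions for illustration, not part of this patch):

#include <linux/irq.h>
#include <linux/clk.h>

struct foo_irq_data {
	struct clk *clk;
};

static int foo_irq_request_resources(struct irq_data *d)
{
	struct foo_irq_data *fd = irq_data_get_irq_chip_data(d);

	/* Anything the line needs before it may fire: clocks, pinmux, ... */
	return clk_prepare_enable(fd->clk);
}

static void foo_irq_release_resources(struct irq_data *d)
{
	struct foo_irq_data *fd = irq_data_get_irq_chip_data(d);

	clk_disable_unprepare(fd->clk);
}

static struct irq_chip foo_irq_chip = {
	.name			= "foo",
	.irq_request_resources	= foo_irq_request_resources,
	.irq_release_resources	= foo_irq_release_resources,
	/* .irq_mask / .irq_unmask / .irq_eoi as usual */
};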
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 36f6ee181b0c..ac1ba2f11032 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -324,15 +324,15 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
324 324
325#ifdef CONFIG_SMP 325#ifdef CONFIG_SMP
326 /* create /proc/irq/<irq>/smp_affinity */ 326 /* create /proc/irq/<irq>/smp_affinity */
327 proc_create_data("smp_affinity", 0600, desc->dir, 327 proc_create_data("smp_affinity", 0644, desc->dir,
328 &irq_affinity_proc_fops, (void *)(long)irq); 328 &irq_affinity_proc_fops, (void *)(long)irq);
329 329
330 /* create /proc/irq/<irq>/affinity_hint */ 330 /* create /proc/irq/<irq>/affinity_hint */
331 proc_create_data("affinity_hint", 0400, desc->dir, 331 proc_create_data("affinity_hint", 0444, desc->dir,
332 &irq_affinity_hint_proc_fops, (void *)(long)irq); 332 &irq_affinity_hint_proc_fops, (void *)(long)irq);
333 333
334 /* create /proc/irq/<irq>/smp_affinity_list */ 334 /* create /proc/irq/<irq>/smp_affinity_list */
335 proc_create_data("smp_affinity_list", 0600, desc->dir, 335 proc_create_data("smp_affinity_list", 0644, desc->dir,
336 &irq_affinity_list_proc_fops, (void *)(long)irq); 336 &irq_affinity_list_proc_fops, (void *)(long)irq);
337 337
338 proc_create_data("node", 0444, desc->dir, 338 proc_create_data("node", 0444, desc->dir,
@@ -372,7 +372,7 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action)
372static void register_default_affinity_proc(void) 372static void register_default_affinity_proc(void)
373{ 373{
374#ifdef CONFIG_SMP 374#ifdef CONFIG_SMP
375 proc_create("irq/default_smp_affinity", 0600, NULL, 375 proc_create("irq/default_smp_affinity", 0644, NULL,
376 &default_affinity_proc_fops); 376 &default_affinity_proc_fops);
377#endif 377#endif
378} 378}
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 55fcce6065cf..a82170e2fa78 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -61,11 +61,11 @@ void __weak arch_irq_work_raise(void)
61 * 61 *
62 * Can be re-enqueued while the callback is still in progress. 62 * Can be re-enqueued while the callback is still in progress.
63 */ 63 */
64void irq_work_queue(struct irq_work *work) 64bool irq_work_queue(struct irq_work *work)
65{ 65{
66 /* Only queue if not already pending */ 66 /* Only queue if not already pending */
67 if (!irq_work_claim(work)) 67 if (!irq_work_claim(work))
68 return; 68 return false;
69 69
70 /* Queue the entry and raise the IPI if needed. */ 70 /* Queue the entry and raise the IPI if needed. */
71 preempt_disable(); 71 preempt_disable();
@@ -83,6 +83,8 @@ void irq_work_queue(struct irq_work *work)
83 } 83 }
84 84
85 preempt_enable(); 85 preempt_enable();
86
87 return true;
86} 88}
87EXPORT_SYMBOL_GPL(irq_work_queue); 89EXPORT_SYMBOL_GPL(irq_work_queue);
88 90
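
irq_work_queue() now reports whether the work was newly queued or was already pending, so callers can keep a "raised" statistic or skip redundant bookkeeping. A hedged sketch of a caller using the new return value (the foo_* names are illustrative):

#include <linux/irq_work.h>
#include <linux/percpu.h>

static void foo_work_func(struct irq_work *work)
{
	/* runs later from the irq_work interrupt/IPI context */
}

static struct irq_work foo_work = { .func = foo_work_func };
static DEFINE_PER_CPU(unsigned long, foo_raised);

static void foo_poke(void)
{
	/* false means the work was already pending; count only genuine raises */
	if (irq_work_queue(&foo_work))
		this_cpu_inc(foo_raised);
}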
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 60bafbed06ab..c0d261c7db7b 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1039,10 +1039,10 @@ void __weak crash_unmap_reserved_pages(void)
1039{} 1039{}
1040 1040
1041#ifdef CONFIG_COMPAT 1041#ifdef CONFIG_COMPAT
1042asmlinkage long compat_sys_kexec_load(unsigned long entry, 1042COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1043 unsigned long nr_segments, 1043 compat_ulong_t, nr_segments,
1044 struct compat_kexec_segment __user *segments, 1044 struct compat_kexec_segment __user *, segments,
1045 unsigned long flags) 1045 compat_ulong_t, flags)
1046{ 1046{
1047 struct compat_kexec_segment in; 1047 struct compat_kexec_segment in;
1048 struct kexec_segment out, __user *ksegments; 1048 struct kexec_segment out, __user *ksegments;
@@ -1235,7 +1235,7 @@ static int __init crash_notes_memory_init(void)
1235 } 1235 }
1236 return 0; 1236 return 0;
1237} 1237}
1238module_init(crash_notes_memory_init) 1238subsys_initcall(crash_notes_memory_init);
1239 1239
1240 1240
1241/* 1241/*
@@ -1629,7 +1629,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1629 return 0; 1629 return 0;
1630} 1630}
1631 1631
1632module_init(crash_save_vmcoreinfo_init) 1632subsys_initcall(crash_save_vmcoreinfo_init);
1633 1633
1634/* 1634/*
1635 * Move into place and start executing a preloaded standalone 1635 * Move into place and start executing a preloaded standalone
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d945a949760f..e660964086e2 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -19,6 +19,8 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/capability.h> 20#include <linux/capability.h>
21 21
22#include <linux/rcupdate.h> /* rcu_expedited */
23
22#define KERNEL_ATTR_RO(_name) \ 24#define KERNEL_ATTR_RO(_name) \
23static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 25static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
24 26
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b5ae3ee860a9..9a130ec06f7a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -217,7 +217,7 @@ int tsk_fork_get_node(struct task_struct *tsk)
217 if (tsk == kthreadd_task) 217 if (tsk == kthreadd_task)
218 return tsk->pref_node_fork; 218 return tsk->pref_node_fork;
219#endif 219#endif
220 return numa_node_id(); 220 return NUMA_NO_NODE;
221} 221}
222 222
223static void create_kthread(struct kthread_create_info *create) 223static void create_kthread(struct kthread_create_info *create)
@@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
369{ 369{
370 struct task_struct *p; 370 struct task_struct *p;
371 371
372 p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, 372 p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt,
373 cpu); 373 cpu);
374 if (IS_ERR(p)) 374 if (IS_ERR(p))
375 return p; 375 return p;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index baab8e5e7f66..306a76b51e0f 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
1 1
2obj-y += mutex.o semaphore.o rwsem.o lglock.o 2obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o
3 3
4ifdef CONFIG_FUNCTION_TRACER 4ifdef CONFIG_FUNCTION_TRACER
5CFLAGS_REMOVE_lockdep.o = -pg 5CFLAGS_REMOVE_lockdep.o = -pg
@@ -23,3 +23,4 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
23obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o 23obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
24obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o 24obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
25obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o 25obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
26obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index eb8a54783fa0..b0e9467922e1 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1936,12 +1936,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1936 1936
1937 for (;;) { 1937 for (;;) {
1938 int distance = curr->lockdep_depth - depth + 1; 1938 int distance = curr->lockdep_depth - depth + 1;
1939 hlock = curr->held_locks + depth-1; 1939 hlock = curr->held_locks + depth - 1;
1940 /* 1940 /*
1941 * Only non-recursive-read entries get new dependencies 1941 * Only non-recursive-read entries get new dependencies
1942 * added: 1942 * added:
1943 */ 1943 */
1944 if (hlock->read != 2) { 1944 if (hlock->read != 2 && hlock->check) {
1945 if (!check_prev_add(curr, hlock, next, 1945 if (!check_prev_add(curr, hlock, next,
1946 distance, trylock_loop)) 1946 distance, trylock_loop))
1947 return 0; 1947 return 0;
@@ -2098,7 +2098,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
2098 * (If lookup_chain_cache() returns with 1 it acquires 2098 * (If lookup_chain_cache() returns with 1 it acquires
2099 * graph_lock for us) 2099 * graph_lock for us)
2100 */ 2100 */
2101 if (!hlock->trylock && (hlock->check == 2) && 2101 if (!hlock->trylock && hlock->check &&
2102 lookup_chain_cache(curr, hlock, chain_key)) { 2102 lookup_chain_cache(curr, hlock, chain_key)) {
2103 /* 2103 /*
2104 * Check whether last held lock: 2104 * Check whether last held lock:
@@ -2517,7 +2517,7 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2517 2517
2518 BUG_ON(usage_bit >= LOCK_USAGE_STATES); 2518 BUG_ON(usage_bit >= LOCK_USAGE_STATES);
2519 2519
2520 if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) 2520 if (!hlock->check)
2521 continue; 2521 continue;
2522 2522
2523 if (!mark_lock(curr, hlock, usage_bit)) 2523 if (!mark_lock(curr, hlock, usage_bit))
@@ -2557,7 +2557,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
2557 debug_atomic_inc(hardirqs_on_events); 2557 debug_atomic_inc(hardirqs_on_events);
2558} 2558}
2559 2559
2560void trace_hardirqs_on_caller(unsigned long ip) 2560__visible void trace_hardirqs_on_caller(unsigned long ip)
2561{ 2561{
2562 time_hardirqs_on(CALLER_ADDR0, ip); 2562 time_hardirqs_on(CALLER_ADDR0, ip);
2563 2563
@@ -2610,7 +2610,7 @@ EXPORT_SYMBOL(trace_hardirqs_on);
2610/* 2610/*
2611 * Hardirqs were disabled: 2611 * Hardirqs were disabled:
2612 */ 2612 */
2613void trace_hardirqs_off_caller(unsigned long ip) 2613__visible void trace_hardirqs_off_caller(unsigned long ip)
2614{ 2614{
2615 struct task_struct *curr = current; 2615 struct task_struct *curr = current;
2616 2616
@@ -3055,9 +3055,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3055 int class_idx; 3055 int class_idx;
3056 u64 chain_key; 3056 u64 chain_key;
3057 3057
3058 if (!prove_locking)
3059 check = 1;
3060
3061 if (unlikely(!debug_locks)) 3058 if (unlikely(!debug_locks))
3062 return 0; 3059 return 0;
3063 3060
@@ -3069,8 +3066,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3069 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 3066 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
3070 return 0; 3067 return 0;
3071 3068
3072 if (lock->key == &__lockdep_no_validate__) 3069 if (!prove_locking || lock->key == &__lockdep_no_validate__)
3073 check = 1; 3070 check = 0;
3074 3071
3075 if (subclass < NR_LOCKDEP_CACHING_CLASSES) 3072 if (subclass < NR_LOCKDEP_CACHING_CLASSES)
3076 class = lock->class_cache[subclass]; 3073 class = lock->class_cache[subclass];
@@ -3138,7 +3135,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3138 hlock->holdtime_stamp = lockstat_clock(); 3135 hlock->holdtime_stamp = lockstat_clock();
3139#endif 3136#endif
3140 3137
3141 if (check == 2 && !mark_irqflags(curr, hlock)) 3138 if (check && !mark_irqflags(curr, hlock))
3142 return 0; 3139 return 0;
3143 3140
3144 /* mark it as used: */ 3141 /* mark it as used: */
@@ -4191,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task)
4191} 4188}
4192EXPORT_SYMBOL_GPL(debug_show_held_locks); 4189EXPORT_SYMBOL_GPL(debug_show_held_locks);
4193 4190
4194void lockdep_sys_exit(void) 4191asmlinkage void lockdep_sys_exit(void)
4195{ 4192{
4196 struct task_struct *curr = current; 4193 struct task_struct *curr = current;
4197 4194
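
After this lockdep change a held lock is either fully checked (hlock->check == 1) or skipped entirely (check == 0), and the latter is now also what __lockdep_no_validate__ classes and !prove_locking map to. A hedged sketch of opting one troublesome lock out of validation with the existing lockdep_set_novalidate_class() helper (struct foo_dev is illustrative):

#include <linux/mutex.h>
#include <linux/lockdep.h>

struct foo_dev {
	struct mutex lock;
};

static void foo_dev_init(struct foo_dev *fd)
{
	mutex_init(&fd->lock);
	/* This class now gets check = 0 in __lock_acquire(): no dependency tracking. */
	lockdep_set_novalidate_class(&fd->lock);
}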
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
new file mode 100644
index 000000000000..f26b1a18e34e
--- /dev/null
+++ b/kernel/locking/locktorture.c
@@ -0,0 +1,452 @@
1/*
2 * Module-based torture test facility for locking
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright (C) IBM Corporation, 2014
19 *
20 * Author: Paul E. McKenney <paulmck@us.ibm.com>
21 * Based on kernel/rcu/torture.c.
22 */
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/init.h>
26#include <linux/module.h>
27#include <linux/kthread.h>
28#include <linux/err.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/interrupt.h>
32#include <linux/sched.h>
33#include <linux/atomic.h>
34#include <linux/bitops.h>
35#include <linux/completion.h>
36#include <linux/moduleparam.h>
37#include <linux/percpu.h>
38#include <linux/notifier.h>
39#include <linux/reboot.h>
40#include <linux/freezer.h>
41#include <linux/cpu.h>
42#include <linux/delay.h>
43#include <linux/stat.h>
44#include <linux/slab.h>
45#include <linux/trace_clock.h>
46#include <asm/byteorder.h>
47#include <linux/torture.h>
48
49MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
51
52torture_param(int, nwriters_stress, -1,
53 "Number of write-locking stress-test threads");
54torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
55torture_param(int, onoff_interval, 0,
56 "Time between CPU hotplugs (s), 0=disable");
57torture_param(int, shuffle_interval, 3,
58 "Number of jiffies between shuffles, 0=disable");
59torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
60torture_param(int, stat_interval, 60,
61 "Number of seconds between stats printk()s");
62torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
63torture_param(bool, verbose, true,
64 "Enable verbose debugging printk()s");
65
66static char *torture_type = "spin_lock";
67module_param(torture_type, charp, 0444);
68MODULE_PARM_DESC(torture_type,
69 "Type of lock to torture (spin_lock, spin_lock_irq, ...)");
70
71static atomic_t n_lock_torture_errors;
72
73static struct task_struct *stats_task;
74static struct task_struct **writer_tasks;
75
76static int nrealwriters_stress;
77static bool lock_is_write_held;
78
79struct lock_writer_stress_stats {
80 long n_write_lock_fail;
81 long n_write_lock_acquired;
82};
83static struct lock_writer_stress_stats *lwsa;
84
85#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE)
86#define LOCKTORTURE_RUNNABLE_INIT 1
87#else
88#define LOCKTORTURE_RUNNABLE_INIT 0
89#endif
90int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT;
91module_param(locktorture_runnable, int, 0444);
92MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot");
93
94/* Forward reference. */
95static void lock_torture_cleanup(void);
96
97/*
98 * Operations vector for selecting different types of tests.
99 */
100struct lock_torture_ops {
101 void (*init)(void);
102 int (*writelock)(void);
103 void (*write_delay)(struct torture_random_state *trsp);
104 void (*writeunlock)(void);
105 unsigned long flags;
106 const char *name;
107};
108
109static struct lock_torture_ops *cur_ops;
110
111/*
112 * Definitions for lock torture testing.
113 */
114
115static int torture_lock_busted_write_lock(void)
116{
117 return 0; /* BUGGY, do not use in real life!!! */
118}
119
120static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
121{
122 const unsigned long longdelay_us = 100;
123
124 /* We want a long delay occasionally to force massive contention. */
125 if (!(torture_random(trsp) %
126 (nrealwriters_stress * 2000 * longdelay_us)))
127 mdelay(longdelay_us);
128#ifdef CONFIG_PREEMPT
129 if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
130 preempt_schedule(); /* Allow test to be preempted. */
131#endif
132}
133
134static void torture_lock_busted_write_unlock(void)
135{
136 /* BUGGY, do not use in real life!!! */
137}
138
139static struct lock_torture_ops lock_busted_ops = {
140 .writelock = torture_lock_busted_write_lock,
141 .write_delay = torture_lock_busted_write_delay,
142 .writeunlock = torture_lock_busted_write_unlock,
143 .name = "lock_busted"
144};
145
146static DEFINE_SPINLOCK(torture_spinlock);
147
148static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
149{
150 spin_lock(&torture_spinlock);
151 return 0;
152}
153
154static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
155{
156 const unsigned long shortdelay_us = 2;
157 const unsigned long longdelay_us = 100;
158
159 /* We want a short delay mostly to emulate likely code, and
160 * we want a long delay occasionally to force massive contention.
161 */
162 if (!(torture_random(trsp) %
163 (nrealwriters_stress * 2000 * longdelay_us)))
164 mdelay(longdelay_us);
165 if (!(torture_random(trsp) %
166 (nrealwriters_stress * 2 * shortdelay_us)))
167 udelay(shortdelay_us);
168#ifdef CONFIG_PREEMPT
169 if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
170 preempt_schedule(); /* Allow test to be preempted. */
171#endif
172}
173
174static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
175{
176 spin_unlock(&torture_spinlock);
177}
178
179static struct lock_torture_ops spin_lock_ops = {
180 .writelock = torture_spin_lock_write_lock,
181 .write_delay = torture_spin_lock_write_delay,
182 .writeunlock = torture_spin_lock_write_unlock,
183 .name = "spin_lock"
184};
185
186static int torture_spin_lock_write_lock_irq(void)
187__acquires(torture_spinlock_irq)
188{
189 unsigned long flags;
190
191 spin_lock_irqsave(&torture_spinlock, flags);
192 cur_ops->flags = flags;
193 return 0;
194}
195
196static void torture_lock_spin_write_unlock_irq(void)
197__releases(torture_spinlock)
198{
199 spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags);
200}
201
202static struct lock_torture_ops spin_lock_irq_ops = {
203 .writelock = torture_spin_lock_write_lock_irq,
204 .write_delay = torture_spin_lock_write_delay,
205 .writeunlock = torture_lock_spin_write_unlock_irq,
206 .name = "spin_lock_irq"
207};
208
209/*
210 * Lock torture writer kthread. Repeatedly acquires and releases
211 * the lock, checking for duplicate acquisitions.
212 */
213static int lock_torture_writer(void *arg)
214{
215 struct lock_writer_stress_stats *lwsp = arg;
216 static DEFINE_TORTURE_RANDOM(rand);
217
218 VERBOSE_TOROUT_STRING("lock_torture_writer task started");
219 set_user_nice(current, 19);
220
221 do {
222 schedule_timeout_uninterruptible(1);
223 cur_ops->writelock();
224 if (WARN_ON_ONCE(lock_is_write_held))
225 lwsp->n_write_lock_fail++;
226 lock_is_write_held = 1;
227 lwsp->n_write_lock_acquired++;
228 cur_ops->write_delay(&rand);
229 lock_is_write_held = 0;
230 cur_ops->writeunlock();
231 stutter_wait("lock_torture_writer");
232 } while (!torture_must_stop());
233 torture_kthread_stopping("lock_torture_writer");
234 return 0;
235}
236
237/*
238 * Create a lock-torture-statistics message in the specified buffer.
239 */
240static void lock_torture_printk(char *page)
241{
242 bool fail = 0;
243 int i;
244 long max = 0;
245 long min = lwsa[0].n_write_lock_acquired;
246 long long sum = 0;
247
248 for (i = 0; i < nrealwriters_stress; i++) {
249 if (lwsa[i].n_write_lock_fail)
250 fail = true;
251 sum += lwsa[i].n_write_lock_acquired;
252		if (max < lwsa[i].n_write_lock_acquired)
253			max = lwsa[i].n_write_lock_acquired;
254		if (min > lwsa[i].n_write_lock_acquired)
255			min = lwsa[i].n_write_lock_acquired;
256 }
257 page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
258 page += sprintf(page,
259 "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
260 sum, max, min, max / 2 > min ? "???" : "",
261 fail, fail ? "!!!" : "");
262 if (fail)
263 atomic_inc(&n_lock_torture_errors);
264}
265
266/*
267 * Print torture statistics. Caller must ensure that there is only one
268 * call to this function at a given time!!! This is normally accomplished
269 * by relying on the module system to only have one copy of the module
270 * loaded, and then by giving the lock_torture_stats kthread full control
271 * (or the init/cleanup functions when lock_torture_stats thread is not
272 * running).
273 */
274static void lock_torture_stats_print(void)
275{
276 int size = nrealwriters_stress * 200 + 8192;
277 char *buf;
278
279 buf = kmalloc(size, GFP_KERNEL);
280 if (!buf) {
281 pr_err("lock_torture_stats_print: Out of memory, need: %d",
282 size);
283 return;
284 }
285 lock_torture_printk(buf);
286 pr_alert("%s", buf);
287 kfree(buf);
288}
289
290/*
291 * Periodically prints torture statistics, if periodic statistics printing
292 * was specified via the stat_interval module parameter.
293 *
294 * No need to worry about fullstop here, since this one doesn't reference
295 * volatile state or register callbacks.
296 */
297static int lock_torture_stats(void *arg)
298{
299 VERBOSE_TOROUT_STRING("lock_torture_stats task started");
300 do {
301 schedule_timeout_interruptible(stat_interval * HZ);
302 lock_torture_stats_print();
303 torture_shutdown_absorb("lock_torture_stats");
304 } while (!torture_must_stop());
305 torture_kthread_stopping("lock_torture_stats");
306 return 0;
307}
308
309static inline void
310lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
311 const char *tag)
312{
313 pr_alert("%s" TORTURE_FLAG
314 "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
315 torture_type, tag, nrealwriters_stress, stat_interval, verbose,
316 shuffle_interval, stutter, shutdown_secs,
317 onoff_interval, onoff_holdoff);
318}
319
320static void lock_torture_cleanup(void)
321{
322 int i;
323
324 if (torture_cleanup())
325 return;
326
327 if (writer_tasks) {
328 for (i = 0; i < nrealwriters_stress; i++)
329 torture_stop_kthread(lock_torture_writer,
330 writer_tasks[i]);
331 kfree(writer_tasks);
332 writer_tasks = NULL;
333 }
334
335 torture_stop_kthread(lock_torture_stats, stats_task);
336 lock_torture_stats_print(); /* -After- the stats thread is stopped! */
337
338 if (atomic_read(&n_lock_torture_errors))
339 lock_torture_print_module_parms(cur_ops,
340 "End of test: FAILURE");
341 else if (torture_onoff_failures())
342 lock_torture_print_module_parms(cur_ops,
343 "End of test: LOCK_HOTPLUG");
344 else
345 lock_torture_print_module_parms(cur_ops,
346 "End of test: SUCCESS");
347}
348
349static int __init lock_torture_init(void)
350{
351 int i;
352 int firsterr = 0;
353 static struct lock_torture_ops *torture_ops[] = {
354 &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops,
355 };
356
357 torture_init_begin(torture_type, verbose, &locktorture_runnable);
358
359 /* Process args and tell the world that the torturer is on the job. */
360 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
361 cur_ops = torture_ops[i];
362 if (strcmp(torture_type, cur_ops->name) == 0)
363 break;
364 }
365 if (i == ARRAY_SIZE(torture_ops)) {
366 pr_alert("lock-torture: invalid torture type: \"%s\"\n",
367 torture_type);
368 pr_alert("lock-torture types:");
369 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
370 pr_alert(" %s", torture_ops[i]->name);
371 pr_alert("\n");
372 torture_init_end();
373 return -EINVAL;
374 }
375 if (cur_ops->init)
376 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
377
378 if (nwriters_stress >= 0)
379 nrealwriters_stress = nwriters_stress;
380 else
381 nrealwriters_stress = 2 * num_online_cpus();
382 lock_torture_print_module_parms(cur_ops, "Start of test");
383
384 /* Initialize the statistics so that each run gets its own numbers. */
385
386 lock_is_write_held = 0;
387 lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL);
388 if (lwsa == NULL) {
389 VERBOSE_TOROUT_STRING("lwsa: Out of memory");
390 firsterr = -ENOMEM;
391 goto unwind;
392 }
393 for (i = 0; i < nrealwriters_stress; i++) {
394 lwsa[i].n_write_lock_fail = 0;
395 lwsa[i].n_write_lock_acquired = 0;
396 }
397
398 /* Start up the kthreads. */
399
400 if (onoff_interval > 0) {
401 firsterr = torture_onoff_init(onoff_holdoff * HZ,
402 onoff_interval * HZ);
403 if (firsterr)
404 goto unwind;
405 }
406 if (shuffle_interval > 0) {
407 firsterr = torture_shuffle_init(shuffle_interval);
408 if (firsterr)
409 goto unwind;
410 }
411 if (shutdown_secs > 0) {
412 firsterr = torture_shutdown_init(shutdown_secs,
413 lock_torture_cleanup);
414 if (firsterr)
415 goto unwind;
416 }
417 if (stutter > 0) {
418 firsterr = torture_stutter_init(stutter);
419 if (firsterr)
420 goto unwind;
421 }
422
423 writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]),
424 GFP_KERNEL);
425 if (writer_tasks == NULL) {
426 VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
427 firsterr = -ENOMEM;
428 goto unwind;
429 }
430 for (i = 0; i < nrealwriters_stress; i++) {
431 firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i],
432 writer_tasks[i]);
433 if (firsterr)
434 goto unwind;
435 }
436 if (stat_interval > 0) {
437 firsterr = torture_create_kthread(lock_torture_stats, NULL,
438 stats_task);
439 if (firsterr)
440 goto unwind;
441 }
442 torture_init_end();
443 return 0;
444
445unwind:
446 torture_init_end();
447 lock_torture_cleanup();
448 return firsterr;
449}
450
451module_init(lock_torture_init);
452module_exit(lock_torture_cleanup);
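
Adding a new lock flavor to the torture test only takes another lock_torture_ops instance plus an entry in torture_ops[] in lock_torture_init(). A hedged sketch of what a mutex-based variant could look like (not part of this patch, which provides only lock_busted, spin_lock and spin_lock_irq):

#include <linux/mutex.h>

static DEFINE_MUTEX(torture_mutex);

static int torture_mutex_lock(void) __acquires(torture_mutex)
{
	mutex_lock(&torture_mutex);
	return 0;
}

static void torture_mutex_unlock(void) __releases(torture_mutex)
{
	mutex_unlock(&torture_mutex);
}

static struct lock_torture_ops mutex_lock_ops = {
	.writelock	= torture_mutex_lock,
	.write_delay	= torture_spin_lock_write_delay,	/* reuse the existing delay */
	.writeunlock	= torture_mutex_unlock,
	.name		= "mutex_lock"
};

/* ...and add &mutex_lock_ops to torture_ops[] in lock_torture_init(). */

With that in place, loading the module with torture_type=mutex_lock would select the new ops vector.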
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
new file mode 100644
index 000000000000..838dc9e00669
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.c
@@ -0,0 +1,178 @@
1
2#include <linux/percpu.h>
3#include <linux/mutex.h>
4#include <linux/sched.h>
5#include "mcs_spinlock.h"
6
7#ifdef CONFIG_SMP
8
9/*
10 * An MCS like lock especially tailored for optimistic spinning for sleeping
11 * lock implementations (mutex, rwsem, etc).
12 *
13 * Using a single mcs node per CPU is safe because sleeping locks should not be
14 * called from interrupt context and we have preemption disabled while
15 * spinning.
16 */
17static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);
18
19/*
20 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
21 * Can return NULL in case we were the last queued and we updated @lock instead.
22 */
23static inline struct optimistic_spin_queue *
24osq_wait_next(struct optimistic_spin_queue **lock,
25 struct optimistic_spin_queue *node,
26 struct optimistic_spin_queue *prev)
27{
28 struct optimistic_spin_queue *next = NULL;
29
30 for (;;) {
31 if (*lock == node && cmpxchg(lock, node, prev) == node) {
32 /*
33 * We were the last queued, we moved @lock back. @prev
34 * will now observe @lock and will complete its
35 * unlock()/unqueue().
36 */
37 break;
38 }
39
40 /*
41 * We must xchg() the @node->next value, because if we were to
42 * leave it in, a concurrent unlock()/unqueue() from
43 * @node->next might complete Step-A and think its @prev is
44 * still valid.
45 *
46 * If the concurrent unlock()/unqueue() wins the race, we'll
47 * wait for either @lock to point to us, through its Step-B, or
48 * wait for a new @node->next from its Step-C.
49 */
50 if (node->next) {
51 next = xchg(&node->next, NULL);
52 if (next)
53 break;
54 }
55
56 arch_mutex_cpu_relax();
57 }
58
59 return next;
60}
61
62bool osq_lock(struct optimistic_spin_queue **lock)
63{
64 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
65 struct optimistic_spin_queue *prev, *next;
66
67 node->locked = 0;
68 node->next = NULL;
69
70 node->prev = prev = xchg(lock, node);
71 if (likely(prev == NULL))
72 return true;
73
74 ACCESS_ONCE(prev->next) = node;
75
76 /*
77 * Normally @prev is untouchable after the above store; because at that
78 * moment unlock can proceed and wipe the node element from stack.
79 *
80 * However, since our nodes are static per-cpu storage, we're
81 * guaranteed their existence -- this allows us to apply
82 * cmpxchg in an attempt to undo our queueing.
83 */
84
85 while (!smp_load_acquire(&node->locked)) {
86 /*
87 * If we need to reschedule bail... so we can block.
88 */
89 if (need_resched())
90 goto unqueue;
91
92 arch_mutex_cpu_relax();
93 }
94 return true;
95
96unqueue:
97 /*
98 * Step - A -- stabilize @prev
99 *
100 * Undo our @prev->next assignment; this will make @prev's
101 * unlock()/unqueue() wait for a next pointer since @lock points to us
102 * (or later).
103 */
104
105 for (;;) {
106 if (prev->next == node &&
107 cmpxchg(&prev->next, node, NULL) == node)
108 break;
109
110 /*
111 * We can only fail the cmpxchg() racing against an unlock(),
112	 * in which case we should observe @node->locked becoming
113 * true.
114 */
115 if (smp_load_acquire(&node->locked))
116 return true;
117
118 arch_mutex_cpu_relax();
119
120 /*
121 * Or we race against a concurrent unqueue()'s step-B, in which
122 * case its step-C will write us a new @node->prev pointer.
123 */
124 prev = ACCESS_ONCE(node->prev);
125 }
126
127 /*
128 * Step - B -- stabilize @next
129 *
130 * Similar to unlock(), wait for @node->next or move @lock from @node
131 * back to @prev.
132 */
133
134 next = osq_wait_next(lock, node, prev);
135 if (!next)
136 return false;
137
138 /*
139 * Step - C -- unlink
140 *
141	 * @prev is stable because it's still waiting for a new @prev->next
142 * pointer, @next is stable because our @node->next pointer is NULL and
143 * it will wait in Step-A.
144 */
145
146 ACCESS_ONCE(next->prev) = prev;
147 ACCESS_ONCE(prev->next) = next;
148
149 return false;
150}
151
152void osq_unlock(struct optimistic_spin_queue **lock)
153{
154 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
155 struct optimistic_spin_queue *next;
156
157 /*
158 * Fast path for the uncontended case.
159 */
160 if (likely(cmpxchg(lock, node, NULL) == node))
161 return;
162
163 /*
164 * Second most likely case.
165 */
166 next = xchg(&node->next, NULL);
167 if (next) {
168 ACCESS_ONCE(next->locked) = 1;
169 return;
170 }
171
172 next = osq_wait_next(lock, node, NULL);
173 if (next)
174 ACCESS_ONCE(next->locked) = 1;
175}
176
177#endif
178
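
osq_lock()/osq_unlock() give a sleeping lock a cancellable spinner queue: only the CPU at the head of the queue spins on the lock itself, and a queued CPU can unqueue when need_resched() is set instead of stalling the whole chain. A hedged sketch of the optimistic-spin loop a mutex-like lock could build on top of it (struct foo_lock and foo_try_acquire() are illustrative simplifications, not the mutex.c code later in this series):

#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include "mcs_spinlock.h"

struct foo_lock {
	atomic_t			count;	/* 1: unlocked, 0: locked */
	struct optimistic_spin_queue	*osq;	/* tail of the spinner queue */
};

static inline bool foo_try_acquire(struct foo_lock *lock)
{
	return atomic_cmpxchg(&lock->count, 1, 0) == 1;
}

static bool foo_optimistic_spin(struct foo_lock *lock)
{
	bool acquired = false;

	preempt_disable();
	if (!osq_lock(&lock->osq))	/* had to unqueue: take the sleeping slow path */
		goto out;

	for (;;) {
		if (foo_try_acquire(lock)) {
			acquired = true;
			break;
		}
		if (need_resched())	/* stop spinning, we should schedule */
			break;
		arch_mutex_cpu_relax();
	}
	osq_unlock(&lock->osq);
out:
	preempt_enable();
	return acquired;		/* false: fall back to sleeping */
}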
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
new file mode 100644
index 000000000000..a2dbac4aca6b
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.h
@@ -0,0 +1,129 @@
1/*
2 * MCS lock defines
3 *
4 * This file contains the main data structure and API definitions of MCS lock.
5 *
6 * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
7 * with the desirable properties of being fair, and with each cpu trying
8 * to acquire the lock spinning on a local variable.
9 * It avoids the expensive cache-line bouncing that common test-and-set spin-lock
10 * implementations incur.
11 */
12#ifndef __LINUX_MCS_SPINLOCK_H
13#define __LINUX_MCS_SPINLOCK_H
14
15#include <asm/mcs_spinlock.h>
16
17struct mcs_spinlock {
18 struct mcs_spinlock *next;
19 int locked; /* 1 if lock acquired */
20};
21
22#ifndef arch_mcs_spin_lock_contended
23/*
24 * Using smp_load_acquire() provides a memory barrier that ensures
25 * subsequent operations happen after the lock is acquired.
26 */
27#define arch_mcs_spin_lock_contended(l) \
28do { \
29 while (!(smp_load_acquire(l))) \
30 arch_mutex_cpu_relax(); \
31} while (0)
32#endif
33
34#ifndef arch_mcs_spin_unlock_contended
35/*
36 * smp_store_release() provides a memory barrier to ensure all
37 * operations in the critical section have been completed before
38 * unlocking.
39 */
40#define arch_mcs_spin_unlock_contended(l) \
41 smp_store_release((l), 1)
42#endif
43
44/*
45 * Note: the smp_load_acquire/smp_store_release pair is not
46 * sufficient to form a full memory barrier across
47 * cpus for many architectures (except x86) for mcs_unlock and mcs_lock.
48 * For applications that need a full barrier across multiple cpus
49 * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be
50 * used after mcs_lock.
51 */
52
53/*
54 * In order to acquire the lock, the caller should declare a local node and
55 * pass a reference of the node to this function in addition to the lock.
56 * If the lock has already been acquired, then this will proceed to spin
57 * on this node->locked until the previous lock holder sets the node->locked
58 * in mcs_spin_unlock().
59 *
60 * We don't inline mcs_spin_lock() so that perf can correctly account for the
61 * time spent in this lock function.
62 */
63static inline
64void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
65{
66 struct mcs_spinlock *prev;
67
68 /* Init node */
69 node->locked = 0;
70 node->next = NULL;
71
72 prev = xchg(lock, node);
73 if (likely(prev == NULL)) {
74 /*
75 * Lock acquired, don't need to set node->locked to 1. Threads
76	 * only spin on their own node->locked value for lock acquisition.
77 * However, since this thread can immediately acquire the lock
78 * and does not proceed to spin on its own node->locked, this
79 * value won't be used. If a debug mode is needed to
80 * audit lock status, then set node->locked value here.
81 */
82 return;
83 }
84 ACCESS_ONCE(prev->next) = node;
85
86 /* Wait until the lock holder passes the lock down. */
87 arch_mcs_spin_lock_contended(&node->locked);
88}
89
90/*
91 * Releases the lock. The caller should pass in the corresponding node that
92 * was used to acquire the lock.
93 */
94static inline
95void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
96{
97 struct mcs_spinlock *next = ACCESS_ONCE(node->next);
98
99 if (likely(!next)) {
100 /*
101 * Release the lock by setting it to NULL
102 */
103 if (likely(cmpxchg(lock, node, NULL) == node))
104 return;
105 /* Wait until the next pointer is set */
106 while (!(next = ACCESS_ONCE(node->next)))
107 arch_mutex_cpu_relax();
108 }
109
110 /* Pass lock to next waiter. */
111 arch_mcs_spin_unlock_contended(&next->locked);
112}
113
114/*
115 * Cancellable version of the MCS lock above.
116 *
117 * Intended for adaptive spinning of sleeping locks:
118 * mutex_lock()/rwsem_down_{read,write}() etc.
119 */
120
121struct optimistic_spin_queue {
122 struct optimistic_spin_queue *next, *prev;
123 int locked; /* 1 if lock acquired */
124};
125
126extern bool osq_lock(struct optimistic_spin_queue **lock);
127extern void osq_unlock(struct optimistic_spin_queue **lock);
128
129#endif /* __LINUX_MCS_SPINLOCK_H */
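
For readers of the new header, a minimal usage sketch (illustrative only, not part of the patch): each contending caller supplies its own mcs_spinlock node, typically on its stack, and passes the same node to both the lock and unlock calls; the "lock" itself is just a tail pointer.

	#include "mcs_spinlock.h"

	static struct mcs_spinlock *demo_lock;		/* hypothetical lock (queue tail pointer) */

	static void demo_critical_section(void)
	{
		struct mcs_spinlock node;		/* per-caller queue node, lives on the stack */

		mcs_spin_lock(&demo_lock, &node);	/* spins on node.locked when contended */
		/* ... critical section ... */
		mcs_spin_unlock(&demo_lock, &node);	/* hands the lock to node.next, if any */
	}
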
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index faf6f5b53e77..e1191c996c59 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -83,6 +83,12 @@ void debug_mutex_unlock(struct mutex *lock)
83 83
84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
85 mutex_clear_owner(lock); 85 mutex_clear_owner(lock);
86
87 /*
88 * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
89 * mutexes so that we can do it here after we've verified state.
90 */
91 atomic_set(&lock->count, 1);
86} 92}
87 93
88void debug_mutex_init(struct mutex *lock, const char *name, 94void debug_mutex_init(struct mutex *lock, const char *name,
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4dd6e4c219de..bc73d33c6760 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,6 +25,7 @@
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26#include <linux/interrupt.h> 26#include <linux/interrupt.h>
27#include <linux/debug_locks.h> 27#include <linux/debug_locks.h>
28#include "mcs_spinlock.h"
28 29
29/* 30/*
30 * In the DEBUG case we are using the "NULL fastpath" for mutexes, 31 * In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -33,6 +34,13 @@
33#ifdef CONFIG_DEBUG_MUTEXES 34#ifdef CONFIG_DEBUG_MUTEXES
34# include "mutex-debug.h" 35# include "mutex-debug.h"
35# include <asm-generic/mutex-null.h> 36# include <asm-generic/mutex-null.h>
37/*
38 * Must be 0 for the debug case so we do not do the unlock outside of the
39 * wait_lock region. debug_mutex_unlock() will do the actual unlock in this
40 * case.
41 */
42# undef __mutex_slowpath_needs_to_unlock
43# define __mutex_slowpath_needs_to_unlock() 0
36#else 44#else
37# include "mutex.h" 45# include "mutex.h"
38# include <asm/mutex.h> 46# include <asm/mutex.h>
@@ -52,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
52 INIT_LIST_HEAD(&lock->wait_list); 60 INIT_LIST_HEAD(&lock->wait_list);
53 mutex_clear_owner(lock); 61 mutex_clear_owner(lock);
54#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 62#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
55 lock->spin_mlock = NULL; 63 lock->osq = NULL;
56#endif 64#endif
57 65
58 debug_mutex_init(lock, name, key); 66 debug_mutex_init(lock, name, key);
@@ -67,8 +75,7 @@ EXPORT_SYMBOL(__mutex_init);
67 * We also put the fastpath first in the kernel image, to make sure the 75 * We also put the fastpath first in the kernel image, to make sure the
68 * branch is predicted by the CPU as default-untaken. 76 * branch is predicted by the CPU as default-untaken.
69 */ 77 */
70static __used noinline void __sched 78__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
71__mutex_lock_slowpath(atomic_t *lock_count);
72 79
73/** 80/**
74 * mutex_lock - acquire the mutex 81 * mutex_lock - acquire the mutex
@@ -111,54 +118,7 @@ EXPORT_SYMBOL(mutex_lock);
111 * more or less simultaneously, the spinners need to acquire a MCS lock 118 * more or less simultaneously, the spinners need to acquire a MCS lock
112 * first before spinning on the owner field. 119 * first before spinning on the owner field.
113 * 120 *
114 * We don't inline mspin_lock() so that perf can correctly account for the
115 * time spent in this lock function.
116 */ 121 */
117struct mspin_node {
118 struct mspin_node *next ;
119 int locked; /* 1 if lock acquired */
120};
121#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
122
123static noinline
124void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
125{
126 struct mspin_node *prev;
127
128 /* Init node */
129 node->locked = 0;
130 node->next = NULL;
131
132 prev = xchg(lock, node);
133 if (likely(prev == NULL)) {
134 /* Lock acquired */
135 node->locked = 1;
136 return;
137 }
138 ACCESS_ONCE(prev->next) = node;
139 smp_wmb();
140 /* Wait until the lock holder passes the lock down */
141 while (!ACCESS_ONCE(node->locked))
142 arch_mutex_cpu_relax();
143}
144
145static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
146{
147 struct mspin_node *next = ACCESS_ONCE(node->next);
148
149 if (likely(!next)) {
150 /*
151 * Release the lock by setting it to NULL
152 */
153 if (cmpxchg(lock, node, NULL) == node)
154 return;
155 /* Wait until the next pointer is set */
156 while (!(next = ACCESS_ONCE(node->next)))
157 arch_mutex_cpu_relax();
158 }
159 ACCESS_ONCE(next->locked) = 1;
160 smp_wmb();
161}
162 122
163/* 123/*
164 * Mutex spinning code migrated from kernel/sched/core.c 124 * Mutex spinning code migrated from kernel/sched/core.c
@@ -212,6 +172,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
212 struct task_struct *owner; 172 struct task_struct *owner;
213 int retval = 1; 173 int retval = 1;
214 174
175 if (need_resched())
176 return 0;
177
215 rcu_read_lock(); 178 rcu_read_lock();
216 owner = ACCESS_ONCE(lock->owner); 179 owner = ACCESS_ONCE(lock->owner);
217 if (owner) 180 if (owner)
@@ -225,7 +188,8 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
225} 188}
226#endif 189#endif
227 190
228static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 191__visible __used noinline
192void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
229 193
230/** 194/**
231 * mutex_unlock - release the mutex 195 * mutex_unlock - release the mutex
@@ -446,9 +410,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
446 if (!mutex_can_spin_on_owner(lock)) 410 if (!mutex_can_spin_on_owner(lock))
447 goto slowpath; 411 goto slowpath;
448 412
413 if (!osq_lock(&lock->osq))
414 goto slowpath;
415
449 for (;;) { 416 for (;;) {
450 struct task_struct *owner; 417 struct task_struct *owner;
451 struct mspin_node node;
452 418
453 if (use_ww_ctx && ww_ctx->acquired > 0) { 419 if (use_ww_ctx && ww_ctx->acquired > 0) {
454 struct ww_mutex *ww; 420 struct ww_mutex *ww;
@@ -463,19 +429,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
463 * performed the optimistic spinning cannot be done. 429 * performed the optimistic spinning cannot be done.
464 */ 430 */
465 if (ACCESS_ONCE(ww->ctx)) 431 if (ACCESS_ONCE(ww->ctx))
466 goto slowpath; 432 break;
467 } 433 }
468 434
469 /* 435 /*
470 * If there's an owner, wait for it to either 436 * If there's an owner, wait for it to either
471 * release the lock or go to sleep. 437 * release the lock or go to sleep.
472 */ 438 */
473 mspin_lock(MLOCK(lock), &node);
474 owner = ACCESS_ONCE(lock->owner); 439 owner = ACCESS_ONCE(lock->owner);
475 if (owner && !mutex_spin_on_owner(lock, owner)) { 440 if (owner && !mutex_spin_on_owner(lock, owner))
476 mspin_unlock(MLOCK(lock), &node); 441 break;
477 goto slowpath;
478 }
479 442
480 if ((atomic_read(&lock->count) == 1) && 443 if ((atomic_read(&lock->count) == 1) &&
481 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 444 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
@@ -488,11 +451,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
488 } 451 }
489 452
490 mutex_set_owner(lock); 453 mutex_set_owner(lock);
491 mspin_unlock(MLOCK(lock), &node); 454 osq_unlock(&lock->osq);
492 preempt_enable(); 455 preempt_enable();
493 return 0; 456 return 0;
494 } 457 }
495 mspin_unlock(MLOCK(lock), &node);
496 458
497 /* 459 /*
498 * When there's no owner, we might have preempted between the 460 * When there's no owner, we might have preempted between the
@@ -501,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
501 * the owner complete. 463 * the owner complete.
502 */ 464 */
503 if (!owner && (need_resched() || rt_task(task))) 465 if (!owner && (need_resched() || rt_task(task)))
504 goto slowpath; 466 break;
505 467
506 /* 468 /*
507 * The cpu_relax() call is a compiler barrier which forces 469 * The cpu_relax() call is a compiler barrier which forces
@@ -511,7 +473,15 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
511 */ 473 */
512 arch_mutex_cpu_relax(); 474 arch_mutex_cpu_relax();
513 } 475 }
476 osq_unlock(&lock->osq);
514slowpath: 477slowpath:
478 /*
479 * If we fell out of the spin path because of need_resched(),
480 * reschedule now, before we try-lock the mutex. This avoids getting
481 * scheduled out right after we obtained the mutex.
482 */
483 if (need_resched())
484 schedule_preempt_disabled();
515#endif 485#endif
516 spin_lock_mutex(&lock->wait_lock, flags); 486 spin_lock_mutex(&lock->wait_lock, flags);
517 487
@@ -717,10 +687,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
717 struct mutex *lock = container_of(lock_count, struct mutex, count); 687 struct mutex *lock = container_of(lock_count, struct mutex, count);
718 unsigned long flags; 688 unsigned long flags;
719 689
720 spin_lock_mutex(&lock->wait_lock, flags);
721 mutex_release(&lock->dep_map, nested, _RET_IP_);
722 debug_mutex_unlock(lock);
723
724 /* 690 /*
725 * some architectures leave the lock unlocked in the fastpath failure 691 * some architectures leave the lock unlocked in the fastpath failure
726 * case, others need to leave it locked. In the later case we have to 692 * case, others need to leave it locked. In the later case we have to
@@ -729,6 +695,10 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
729 if (__mutex_slowpath_needs_to_unlock()) 695 if (__mutex_slowpath_needs_to_unlock())
730 atomic_set(&lock->count, 1); 696 atomic_set(&lock->count, 1);
731 697
698 spin_lock_mutex(&lock->wait_lock, flags);
699 mutex_release(&lock->dep_map, nested, _RET_IP_);
700 debug_mutex_unlock(lock);
701
732 if (!list_empty(&lock->wait_list)) { 702 if (!list_empty(&lock->wait_list)) {
733 /* get the first entry from the wait-list: */ 703 /* get the first entry from the wait-list: */
734 struct mutex_waiter *waiter = 704 struct mutex_waiter *waiter =
@@ -746,7 +716,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
746/* 716/*
747 * Release the lock, slowpath: 717 * Release the lock, slowpath:
748 */ 718 */
749static __used noinline void 719__visible void
750__mutex_unlock_slowpath(atomic_t *lock_count) 720__mutex_unlock_slowpath(atomic_t *lock_count)
751{ 721{
752 __mutex_unlock_common_slowpath(lock_count, 1); 722 __mutex_unlock_common_slowpath(lock_count, 1);
@@ -803,7 +773,7 @@ int __sched mutex_lock_killable(struct mutex *lock)
803} 773}
804EXPORT_SYMBOL(mutex_lock_killable); 774EXPORT_SYMBOL(mutex_lock_killable);
805 775
806static __used noinline void __sched 776__visible void __sched
807__mutex_lock_slowpath(atomic_t *lock_count) 777__mutex_lock_slowpath(atomic_t *lock_count)
808{ 778{
809 struct mutex *lock = container_of(lock_count, struct mutex, count); 779 struct mutex *lock = container_of(lock_count, struct mutex, count);
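
Condensed sketch of how the optimistic-spin path reads after this change (ww-mutex handling and lockdep details omitted; the helper name and the "acquired" flag are illustrative): osq_lock() is taken once per mutex_lock() attempt and osq_unlock() is called exactly once on every exit from the loop.

	static bool demo_mutex_optimistic_spin(struct mutex *lock)
	{
		bool acquired = false;

		if (!osq_lock(&lock->osq))		/* too much contention: take the slowpath */
			return false;

		for (;;) {
			struct task_struct *owner = ACCESS_ONCE(lock->owner);

			if (owner && !mutex_spin_on_owner(lock, owner))
				break;			/* owner blocked, or we need to reschedule */

			if (atomic_read(&lock->count) == 1 &&
			    atomic_cmpxchg(&lock->count, 1, 0) == 1) {
				acquired = true;	/* got the mutex while spinning */
				break;
			}

			if (!owner && (need_resched() || rt_task(current)))
				break;			/* don't keep spinning without an owner */

			arch_mutex_cpu_relax();
		}

		osq_unlock(&lock->osq);			/* single release on every path */
		return acquired;
	}
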
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 2e960a2bab81..aa4dff04b594 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -213,6 +213,18 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
213} 213}
214 214
215/* 215/*
216 * Called by sched_setscheduler() to check whether the priority change
217 * is overruled by a possible priority boosting.
218 */
219int rt_mutex_check_prio(struct task_struct *task, int newprio)
220{
221 if (!task_has_pi_waiters(task))
222 return 0;
223
224 return task_top_pi_waiter(task)->task->prio <= newprio;
225}
226
227/*
216 * Adjust the priority of a task, after its pi_waiters got modified. 228 * Adjust the priority of a task, after its pi_waiters got modified.
217 * 229 *
218 * This can be both boosting and unboosting. task->pi_lock must be held. 230 * This can be both boosting and unboosting. task->pi_lock must be held.
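
A sketch of the intended contract of the new helper, assuming the scheduler-side caller looks roughly like this (the helper returns non-zero when a PI waiter already boosts the task to newprio or better, so the requested priority must not take effect yet):

	static void demo_apply_new_prio(struct task_struct *p, int newprio)
	{
		if (rt_mutex_check_prio(p, newprio)) {
			/* A PI waiter boosts p at or above newprio: keep the boost. */
			return;
		}
		/* ... otherwise the scheduler may adopt newprio for p ... */
	}
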
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 19c5fa95e0b4..1d66e08e897d 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -143,6 +143,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
143/* 143/*
144 * wait for the read lock to be granted 144 * wait for the read lock to be granted
145 */ 145 */
146__visible
146struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) 147struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
147{ 148{
148 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; 149 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
@@ -190,6 +191,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
190/* 191/*
191 * wait until we successfully acquire the write lock 192 * wait until we successfully acquire the write lock
192 */ 193 */
194__visible
193struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) 195struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
194{ 196{
195 long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; 197 long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
@@ -252,6 +254,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
252 * handle waking up a waiter on the semaphore 254 * handle waking up a waiter on the semaphore
253 * - up_read/up_write has decremented the active part of count if we come here 255 * - up_read/up_write has decremented the active part of count if we come here
254 */ 256 */
257__visible
255struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) 258struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
256{ 259{
257 unsigned long flags; 260 unsigned long flags;
@@ -272,6 +275,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
272 * - caller incremented waiting part of count and discovered it still negative 275 * - caller incremented waiting part of count and discovered it still negative
273 * - just wake up any readers at the front of the queue 276 * - just wake up any readers at the front of the queue
274 */ 277 */
278__visible
275struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) 279struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
276{ 280{
277 unsigned long flags; 281 unsigned long flags;
diff --git a/kernel/module.c b/kernel/module.c
index d24fcf29cb64..29f7790eaa14 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1013,9 +1013,11 @@ static size_t module_flags_taint(struct module *mod, char *buf)
1013 buf[l++] = 'F'; 1013 buf[l++] = 'F';
1014 if (mod->taints & (1 << TAINT_CRAP)) 1014 if (mod->taints & (1 << TAINT_CRAP))
1015 buf[l++] = 'C'; 1015 buf[l++] = 'C';
1016 if (mod->taints & (1 << TAINT_UNSIGNED_MODULE))
1017 buf[l++] = 'E';
1016 /* 1018 /*
1017 * TAINT_FORCED_RMMOD: could be added. 1019 * TAINT_FORCED_RMMOD: could be added.
1018 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't 1020 * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
1019 * apply to modules. 1021 * apply to modules.
1020 */ 1022 */
1021 return l; 1023 return l;
@@ -1948,6 +1950,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1948 1950
1949 switch (sym[i].st_shndx) { 1951 switch (sym[i].st_shndx) {
1950 case SHN_COMMON: 1952 case SHN_COMMON:
1953 /* Ignore common symbols */
1954 if (!strncmp(name, "__gnu_lto", 9))
1955 break;
1956
1951 /* We compiled with -fno-common. These are not 1957 /* We compiled with -fno-common. These are not
1952 supposed to happen. */ 1958 supposed to happen. */
1953 pr_debug("Common symbol: %s\n", name); 1959 pr_debug("Common symbol: %s\n", name);
@@ -3214,7 +3220,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
3214 pr_notice_once("%s: module verification failed: signature " 3220 pr_notice_once("%s: module verification failed: signature "
3215 "and/or required key missing - tainting " 3221 "and/or required key missing - tainting "
3216 "kernel\n", mod->name); 3222 "kernel\n", mod->name);
3217 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); 3223 add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
3218 } 3224 }
3219#endif 3225#endif
3220 3226
@@ -3809,12 +3815,12 @@ void print_modules(void)
3809 list_for_each_entry_rcu(mod, &modules, list) { 3815 list_for_each_entry_rcu(mod, &modules, list) {
3810 if (mod->state == MODULE_STATE_UNFORMED) 3816 if (mod->state == MODULE_STATE_UNFORMED)
3811 continue; 3817 continue;
3812 printk(" %s%s", mod->name, module_flags(mod, buf)); 3818 pr_cont(" %s%s", mod->name, module_flags(mod, buf));
3813 } 3819 }
3814 preempt_enable(); 3820 preempt_enable();
3815 if (last_unloaded_module[0]) 3821 if (last_unloaded_module[0])
3816 printk(" [last unloaded: %s]", last_unloaded_module); 3822 pr_cont(" [last unloaded: %s]", last_unloaded_module);
3817 printk("\n"); 3823 pr_cont("\n");
3818} 3824}
3819 3825
3820#ifdef CONFIG_MODVERSIONS 3826#ifdef CONFIG_MODVERSIONS
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2d5cc4ccff7f..db4c8b08a50c 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
309 * racy then it does not matter what the result of the test 309 * racy then it does not matter what the result of the test
310 * is, we re-check the list after having taken the lock anyway: 310 * is, we re-check the list after having taken the lock anyway:
311 */ 311 */
312 if (rcu_dereference_raw(nh->head)) { 312 if (rcu_access_pointer(nh->head)) {
313 down_read(&nh->rwsem); 313 down_read(&nh->rwsem);
314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, 314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
315 nr_calls); 315 nr_calls);
diff --git a/kernel/panic.c b/kernel/panic.c
index 6d6300375090..79fd820bb5e8 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -199,7 +199,7 @@ struct tnt {
199static const struct tnt tnts[] = { 199static const struct tnt tnts[] = {
200 { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, 200 { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
201 { TAINT_FORCED_MODULE, 'F', ' ' }, 201 { TAINT_FORCED_MODULE, 'F', ' ' },
202 { TAINT_UNSAFE_SMP, 'S', ' ' }, 202 { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' },
203 { TAINT_FORCED_RMMOD, 'R', ' ' }, 203 { TAINT_FORCED_RMMOD, 'R', ' ' },
204 { TAINT_MACHINE_CHECK, 'M', ' ' }, 204 { TAINT_MACHINE_CHECK, 'M', ' ' },
205 { TAINT_BAD_PAGE, 'B', ' ' }, 205 { TAINT_BAD_PAGE, 'B', ' ' },
@@ -210,6 +210,7 @@ static const struct tnt tnts[] = {
210 { TAINT_CRAP, 'C', ' ' }, 210 { TAINT_CRAP, 'C', ' ' },
211 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, 211 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
212 { TAINT_OOT_MODULE, 'O', ' ' }, 212 { TAINT_OOT_MODULE, 'O', ' ' },
213 { TAINT_UNSIGNED_MODULE, 'E', ' ' },
213}; 214};
214 215
215/** 216/**
@@ -228,6 +229,7 @@ static const struct tnt tnts[] = {
228 * 'C' - modules from drivers/staging are loaded. 229 * 'C' - modules from drivers/staging are loaded.
229 * 'I' - Working around severe firmware bug. 230 * 'I' - Working around severe firmware bug.
230 * 'O' - Out-of-tree module has been loaded. 231 * 'O' - Out-of-tree module has been loaded.
232 * 'E' - Unsigned module has been loaded.
231 * 233 *
232 * The string is overwritten by the next call to print_tainted(). 234 * The string is overwritten by the next call to print_tainted().
233 */ 235 */
@@ -459,7 +461,7 @@ EXPORT_SYMBOL(warn_slowpath_null);
459 * Called when gcc's -fstack-protector feature is used, and 461 * Called when gcc's -fstack-protector feature is used, and
460 * gcc detects corruption of the on-stack canary value 462 * gcc detects corruption of the on-stack canary value
461 */ 463 */
462void __stack_chk_fail(void) 464__visible void __stack_chk_fail(void)
463{ 465{
464 panic("stack-protector: Kernel stack is corrupted in: %p\n", 466 panic("stack-protector: Kernel stack is corrupted in: %p\n",
465 __builtin_return_address(0)); 467 __builtin_return_address(0));
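
Illustrative userspace check for the new 'E' flag (the bit position below is an assumption; the authoritative TAINT_UNSIGNED_MODULE value lives in include/linux/kernel.h):

	#include <stdio.h>

	#define DEMO_TAINT_UNSIGNED_MODULE 13	/* assumed bit index for the 'E' flag */

	int main(void)
	{
		unsigned long mask;
		FILE *f = fopen("/proc/sys/kernel/tainted", "r");

		if (!f) {
			perror("/proc/sys/kernel/tainted");
			return 1;
		}
		if (fscanf(f, "%lu", &mask) != 1) {
			fclose(f);
			return 1;
		}
		fclose(f);
		printf("unsigned-module taint ('E'): %s\n",
		       (mask >> DEMO_TAINT_UNSIGNED_MODULE) & 1 ? "set" : "clear");
		return 0;
	}
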
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 06c62de9c711..db95d8eb761b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -318,7 +318,9 @@ static void *pidns_get(struct task_struct *task)
318 struct pid_namespace *ns; 318 struct pid_namespace *ns;
319 319
320 rcu_read_lock(); 320 rcu_read_lock();
321 ns = get_pid_ns(task_active_pid_ns(task)); 321 ns = task_active_pid_ns(task);
322 if (ns)
323 get_pid_ns(ns);
322 rcu_read_unlock(); 324 rcu_read_unlock();
323 325
324 return ns; 326 return ns;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 37170d4dd9a6..f4f2073711d3 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -973,16 +973,20 @@ static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
973static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, 973static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
974 const char *buf, size_t n) 974 const char *buf, size_t n)
975{ 975{
976 unsigned int maj, min;
977 dev_t res; 976 dev_t res;
978 int ret = -EINVAL; 977 int len = n;
978 char *name;
979 979
980 if (sscanf(buf, "%u:%u", &maj, &min) != 2) 980 if (len && buf[len-1] == '\n')
981 goto out; 981 len--;
982 name = kstrndup(buf, len, GFP_KERNEL);
983 if (!name)
984 return -ENOMEM;
982 985
983 res = MKDEV(maj,min); 986 res = name_to_dev_t(name);
984 if (maj != MAJOR(res) || min != MINOR(res)) 987 kfree(name);
985 goto out; 988 if (!res)
989 return -EINVAL;
986 990
987 lock_system_sleep(); 991 lock_system_sleep();
988 swsusp_resume_device = res; 992 swsusp_resume_device = res;
@@ -990,9 +994,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
990 printk(KERN_INFO "PM: Starting manual resume from disk\n"); 994 printk(KERN_INFO "PM: Starting manual resume from disk\n");
991 noresume = 0; 995 noresume = 0;
992 software_resume(); 996 software_resume();
993 ret = n; 997 return n;
994 out:
995 return ret;
996} 998}
997 999
998power_attr(resume); 1000power_attr(resume);
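
With this change /sys/power/resume accepts anything name_to_dev_t() can resolve, such as a device name, in addition to the old "major:minor" form. A minimal userspace sketch, with a purely illustrative device name:

	#include <stdio.h>

	int main(void)
	{
		const char *dev = "/dev/sda2";		/* illustrative resume partition */
		FILE *f = fopen("/sys/power/resume", "w");

		if (!f) {
			perror("/sys/power/resume");
			return 1;
		}
		fprintf(f, "%s\n", dev);
		return fclose(f) ? 1 : 0;
	}
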
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1d1bf630e6e9..6271bc4073ef 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -282,8 +282,8 @@ struct kobject *power_kobj;
282 * state - control system power state. 282 * state - control system power state.
283 * 283 *
284 * show() returns what states are supported, which is hard-coded to 284 * show() returns what states are supported, which is hard-coded to
285 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and 285 * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend),
286 * 'disk' (Suspend-to-Disk). 286 * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk).
287 * 287 *
288 * store() accepts one of those strings, translates it into the 288 * store() accepts one of those strings, translates it into the
289 * proper enumerated value, and initiates a suspend transition. 289 * proper enumerated value, and initiates a suspend transition.
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 7d4b7ffb3c1d..1ca753106557 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -49,6 +49,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
49 */ 49 */
50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
51 51
52asmlinkage int swsusp_save(void);
53
52/* kernel/power/hibernate.c */ 54/* kernel/power/hibernate.c */
53extern bool freezer_test_done; 55extern bool freezer_test_done;
54 56
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 8dff9b48075a..884b77058864 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -66,6 +66,7 @@ static struct pm_qos_constraints cpu_dma_constraints = {
66 .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), 66 .list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
67 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 67 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
68 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 68 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
69 .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
69 .type = PM_QOS_MIN, 70 .type = PM_QOS_MIN,
70 .notifiers = &cpu_dma_lat_notifier, 71 .notifiers = &cpu_dma_lat_notifier,
71}; 72};
@@ -79,6 +80,7 @@ static struct pm_qos_constraints network_lat_constraints = {
79 .list = PLIST_HEAD_INIT(network_lat_constraints.list), 80 .list = PLIST_HEAD_INIT(network_lat_constraints.list),
80 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 81 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
81 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 82 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
83 .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
82 .type = PM_QOS_MIN, 84 .type = PM_QOS_MIN,
83 .notifiers = &network_lat_notifier, 85 .notifiers = &network_lat_notifier,
84}; 86};
@@ -93,6 +95,7 @@ static struct pm_qos_constraints network_tput_constraints = {
93 .list = PLIST_HEAD_INIT(network_tput_constraints.list), 95 .list = PLIST_HEAD_INIT(network_tput_constraints.list),
94 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 96 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
95 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 97 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
98 .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
96 .type = PM_QOS_MAX, 99 .type = PM_QOS_MAX,
97 .notifiers = &network_throughput_notifier, 100 .notifiers = &network_throughput_notifier,
98}; 101};
@@ -128,7 +131,7 @@ static const struct file_operations pm_qos_power_fops = {
128static inline int pm_qos_get_value(struct pm_qos_constraints *c) 131static inline int pm_qos_get_value(struct pm_qos_constraints *c)
129{ 132{
130 if (plist_head_empty(&c->list)) 133 if (plist_head_empty(&c->list))
131 return c->default_value; 134 return c->no_constraint_value;
132 135
133 switch (c->type) { 136 switch (c->type) {
134 case PM_QOS_MIN: 137 case PM_QOS_MIN:
@@ -170,6 +173,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
170{ 173{
171 unsigned long flags; 174 unsigned long flags;
172 int prev_value, curr_value, new_value; 175 int prev_value, curr_value, new_value;
176 int ret;
173 177
174 spin_lock_irqsave(&pm_qos_lock, flags); 178 spin_lock_irqsave(&pm_qos_lock, flags);
175 prev_value = pm_qos_get_value(c); 179 prev_value = pm_qos_get_value(c);
@@ -205,13 +209,15 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
205 209
206 trace_pm_qos_update_target(action, prev_value, curr_value); 210 trace_pm_qos_update_target(action, prev_value, curr_value);
207 if (prev_value != curr_value) { 211 if (prev_value != curr_value) {
208 blocking_notifier_call_chain(c->notifiers, 212 ret = 1;
209 (unsigned long)curr_value, 213 if (c->notifiers)
210 NULL); 214 blocking_notifier_call_chain(c->notifiers,
211 return 1; 215 (unsigned long)curr_value,
216 NULL);
212 } else { 217 } else {
213 return 0; 218 ret = 0;
214 } 219 }
220 return ret;
215} 221}
216 222
217/** 223/**
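
A condensed rendering of the patched helper (default branch simplified): with no requests queued, a constraint class now reports its no_constraint_value rather than default_value. The two are initialized to the same number for the classes above, but they may diverge for other classes.

	static int demo_pm_qos_get_value(struct pm_qos_constraints *c)
	{
		if (plist_head_empty(&c->list))
			return c->no_constraint_value;	/* new: was c->default_value */

		switch (c->type) {
		case PM_QOS_MIN:
			return plist_first(&c->list)->prio;
		case PM_QOS_MAX:
			return plist_last(&c->list)->prio;
		default:
			return c->default_value;	/* unreachable for valid types */
		}
	}
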
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d9f61a145802..149e745eaa52 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1268,7 +1268,7 @@ static void free_unnecessary_pages(void)
1268 * [number of saveable pages] - [number of pages that can be freed in theory] 1268 * [number of saveable pages] - [number of pages that can be freed in theory]
1269 * 1269 *
1270 * where the second term is the sum of (1) reclaimable slab pages, (2) active 1270 * where the second term is the sum of (1) reclaimable slab pages, (2) active
1271 * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages, 1271 * and (3) inactive anonymous pages, (4) active and (5) inactive file pages,
1272 * minus mapped file pages. 1272 * minus mapped file pages.
1273 */ 1273 */
1274static unsigned long minimum_image_size(unsigned long saveable) 1274static unsigned long minimum_image_size(unsigned long saveable)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 62ee437b5c7e..90b3d9366d1a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -39,7 +39,7 @@ static const struct platform_suspend_ops *suspend_ops;
39 39
40static bool need_suspend_ops(suspend_state_t state) 40static bool need_suspend_ops(suspend_state_t state)
41{ 41{
42 return !!(state > PM_SUSPEND_FREEZE); 42 return state > PM_SUSPEND_FREEZE;
43} 43}
44 44
45static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); 45static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 8f50de394d22..019069c84ff6 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -18,6 +18,8 @@
18#include <linux/rbtree.h> 18#include <linux/rbtree.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20 20
21#include "power.h"
22
21static DEFINE_MUTEX(wakelocks_lock); 23static DEFINE_MUTEX(wakelocks_lock);
22 24
23struct wakelock { 25struct wakelock {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 4dae9cbe9259..a45b50962295 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -319,7 +319,7 @@ static void log_store(int facility, int level,
319 else 319 else
320 free = log_first_idx - log_next_idx; 320 free = log_first_idx - log_next_idx;
321 321
322 if (free > size + sizeof(struct printk_log)) 322 if (free >= size + sizeof(struct printk_log))
323 break; 323 break;
324 324
 325 /* drop old messages until we have enough continuous space */ 325
@@ -327,7 +327,7 @@ static void log_store(int facility, int level,
327 log_first_seq++; 327 log_first_seq++;
328 } 328 }
329 329
330 if (log_next_idx + size + sizeof(struct printk_log) >= log_buf_len) { 330 if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
331 /* 331 /*
332 * This message + an additional empty header does not fit 332 * This message + an additional empty header does not fit
333 * at the end of the buffer. Add an empty header with len == 0 333 * at the end of the buffer. Add an empty header with len == 0
@@ -351,7 +351,7 @@ static void log_store(int facility, int level,
351 else 351 else
352 msg->ts_nsec = local_clock(); 352 msg->ts_nsec = local_clock();
353 memset(log_dict(msg) + dict_len, 0, pad_len); 353 memset(log_dict(msg) + dict_len, 0, pad_len);
354 msg->len = sizeof(struct printk_log) + text_len + dict_len + pad_len; 354 msg->len = size;
355 355
356 /* insert message */ 356 /* insert message */
357 log_next_idx += msg->len; 357 log_next_idx += msg->len;
@@ -1560,9 +1560,12 @@ asmlinkage int vprintk_emit(int facility, int level,
1560 level = kern_level - '0'; 1560 level = kern_level - '0';
1561 case 'd': /* KERN_DEFAULT */ 1561 case 'd': /* KERN_DEFAULT */
1562 lflags |= LOG_PREFIX; 1562 lflags |= LOG_PREFIX;
1563 case 'c': /* KERN_CONT */
1564 break;
1565 } 1563 }
1564 /*
 1565 * No need to check the length here because vscnprintf
 1566 * puts '\0' at the end of the string.  Only a valid and
 1567 * newly printed level is detected.
1568 */
1566 text_len -= end_of_header - text; 1569 text_len -= end_of_header - text;
1567 text = (char *)end_of_header; 1570 text = (char *)end_of_header;
1568 } 1571 }
@@ -1880,6 +1883,7 @@ void suspend_console(void)
1880 console_lock(); 1883 console_lock();
1881 console_suspended = 1; 1884 console_suspended = 1;
1882 up(&console_sem); 1885 up(&console_sem);
1886 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
1883} 1887}
1884 1888
1885void resume_console(void) 1889void resume_console(void)
@@ -1887,6 +1891,7 @@ void resume_console(void)
1887 if (!console_suspend_enabled) 1891 if (!console_suspend_enabled)
1888 return; 1892 return;
1889 down(&console_sem); 1893 down(&console_sem);
1894 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1890 console_suspended = 0; 1895 console_suspended = 0;
1891 console_unlock(); 1896 console_unlock();
1892} 1897}
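
Standalone illustration of the two boundary fixes in log_store(): a record needs room for its own size plus one spare header (so '>=' is the correct free-space comparison), and it only has to wrap when it does not fit exactly up to the end of the buffer (so '>' is correct there). The constant and helper names are illustrative.

	#include <stdbool.h>
	#include <stddef.h>

	#define DEMO_HDR_SIZE 16	/* stand-in for sizeof(struct printk_log) */

	static bool demo_have_space(size_t free, size_t size)
	{
		return free >= size + DEMO_HDR_SIZE;		/* was: free > size + hdr */
	}

	static bool demo_must_wrap(size_t next_idx, size_t size, size_t buf_len)
	{
		return next_idx + size + DEMO_HDR_SIZE > buf_len;	/* was: >= buf_len */
	}
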
diff --git a/kernel/profile.c b/kernel/profile.c
index 93b2a3fe0a64..cb980f0c731b 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -614,5 +614,5 @@ out:
614 cpu_notifier_register_done(); 614 cpu_notifier_register_done();
615 return err; 615 return err;
616} 616}
617module_init(create_proc_profile); 617subsys_initcall(create_proc_profile);
618#endif /* CONFIG_PROC_FS */ 618#endif /* CONFIG_PROC_FS */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1f4bcb3cc21c..adf98622cb32 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1180,8 +1180,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
1180 return ret; 1180 return ret;
1181} 1181}
1182 1182
1183asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, 1183COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid,
1184 compat_long_t addr, compat_long_t data) 1184 compat_long_t, addr, compat_long_t, data)
1185{ 1185{
1186 struct task_struct *child; 1186 struct task_struct *child;
1187 long ret; 1187 long ret;
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 01e9ec37a3e3..807ccfbf69b3 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,5 +1,5 @@
1obj-y += update.o srcu.o 1obj-y += update.o srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o 2obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
3obj-$(CONFIG_TREE_RCU) += tree.o 3obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o 4obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o 5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 79c3877e9c5b..bfda2726ca45 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2011 18 * Copyright IBM Corporation, 2011
19 * 19 *
@@ -23,6 +23,7 @@
23#ifndef __LINUX_RCU_H 23#ifndef __LINUX_RCU_H
24#define __LINUX_RCU_H 24#define __LINUX_RCU_H
25 25
26#include <trace/events/rcu.h>
26#ifdef CONFIG_RCU_TRACE 27#ifdef CONFIG_RCU_TRACE
27#define RCU_TRACE(stmt) stmt 28#define RCU_TRACE(stmt) stmt
28#else /* #ifdef CONFIG_RCU_TRACE */ 29#else /* #ifdef CONFIG_RCU_TRACE */
@@ -116,8 +117,6 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
116 } 117 }
117} 118}
118 119
119extern int rcu_expedited;
120
121#ifdef CONFIG_RCU_STALL_COMMON 120#ifdef CONFIG_RCU_STALL_COMMON
122 121
123extern int rcu_cpu_stall_suppress; 122extern int rcu_cpu_stall_suppress;
diff --git a/kernel/rcu/torture.c b/kernel/rcu/rcutorture.c
index 732f8ae3086a..bd30bc61bc05 100644
--- a/kernel/rcu/torture.c
+++ b/kernel/rcu/rcutorture.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2005, 2006 18 * Copyright (C) IBM Corporation, 2005, 2006
19 * 19 *
@@ -48,110 +48,58 @@
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <linux/trace_clock.h> 49#include <linux/trace_clock.h>
50#include <asm/byteorder.h> 50#include <asm/byteorder.h>
51#include <linux/torture.h>
51 52
52MODULE_LICENSE("GPL"); 53MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 54MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
54 55
55MODULE_ALIAS("rcutorture"); 56
56#ifdef MODULE_PARAM_PREFIX 57torture_param(int, fqs_duration, 0,
57#undef MODULE_PARAM_PREFIX 58 "Duration of fqs bursts (us), 0 to disable");
58#endif 59torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
59#define MODULE_PARAM_PREFIX "rcutorture." 60torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
60 61torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
61static int fqs_duration; 62torture_param(bool, gp_normal, false,
62module_param(fqs_duration, int, 0444); 63 "Use normal (non-expedited) GP wait primitives");
63MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); 64torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
64static int fqs_holdoff; 65torture_param(int, n_barrier_cbs, 0,
65module_param(fqs_holdoff, int, 0444); 66 "# of callbacks/kthreads for barrier testing");
66MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 67torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
67static int fqs_stutter = 3; 68torture_param(int, nreaders, -1, "Number of RCU reader threads");
68module_param(fqs_stutter, int, 0444); 69torture_param(int, object_debug, 0,
69MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 70 "Enable debug-object double call_rcu() testing");
70static bool gp_exp; 71torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
71module_param(gp_exp, bool, 0444); 72torture_param(int, onoff_interval, 0,
72MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); 73 "Time between CPU hotplugs (s), 0=disable");
73static bool gp_normal; 74torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
74module_param(gp_normal, bool, 0444); 75torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
75MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); 76torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
76static int irqreader = 1; 77torture_param(int, stall_cpu_holdoff, 10,
77module_param(irqreader, int, 0444); 78 "Time to wait before starting stall (s).");
78MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 79torture_param(int, stat_interval, 60,
79static int n_barrier_cbs; 80 "Number of seconds between stats printk()s");
80module_param(n_barrier_cbs, int, 0444); 81torture_param(int, stutter, 5, "Number of seconds to run/halt test");
81MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); 82torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
82static int nfakewriters = 4; 83torture_param(int, test_boost_duration, 4,
83module_param(nfakewriters, int, 0444); 84 "Duration of each boost test, seconds.");
84MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); 85torture_param(int, test_boost_interval, 7,
85static int nreaders = -1; 86 "Interval between boost tests, seconds.");
86module_param(nreaders, int, 0444); 87torture_param(bool, test_no_idle_hz, true,
87MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 88 "Test support for tickless idle CPUs");
88static int object_debug; 89torture_param(bool, verbose, true,
89module_param(object_debug, int, 0444); 90 "Enable verbose debugging printk()s");
90MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); 91
91static int onoff_holdoff;
92module_param(onoff_holdoff, int, 0444);
93MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
94static int onoff_interval;
95module_param(onoff_interval, int, 0444);
96MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
97static int shuffle_interval = 3;
98module_param(shuffle_interval, int, 0444);
99MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
100static int shutdown_secs;
101module_param(shutdown_secs, int, 0444);
102MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable.");
103static int stall_cpu;
104module_param(stall_cpu, int, 0444);
105MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
106static int stall_cpu_holdoff = 10;
107module_param(stall_cpu_holdoff, int, 0444);
108MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
109static int stat_interval = 60;
110module_param(stat_interval, int, 0644);
111MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
112static int stutter = 5;
113module_param(stutter, int, 0444);
114MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
115static int test_boost = 1;
116module_param(test_boost, int, 0444);
117MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
118static int test_boost_duration = 4;
119module_param(test_boost_duration, int, 0444);
120MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
121static int test_boost_interval = 7;
122module_param(test_boost_interval, int, 0444);
123MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
124static bool test_no_idle_hz = true;
125module_param(test_no_idle_hz, bool, 0444);
126MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
127static char *torture_type = "rcu"; 92static char *torture_type = "rcu";
128module_param(torture_type, charp, 0444); 93module_param(torture_type, charp, 0444);
129MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); 94MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
130static bool verbose;
131module_param(verbose, bool, 0444);
132MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
133
134#define TORTURE_FLAG "-torture:"
135#define PRINTK_STRING(s) \
136 do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
137#define VERBOSE_PRINTK_STRING(s) \
138 do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
139#define VERBOSE_PRINTK_ERRSTRING(s) \
140 do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
141 95
142static int nrealreaders; 96static int nrealreaders;
143static struct task_struct *writer_task; 97static struct task_struct *writer_task;
144static struct task_struct **fakewriter_tasks; 98static struct task_struct **fakewriter_tasks;
145static struct task_struct **reader_tasks; 99static struct task_struct **reader_tasks;
146static struct task_struct *stats_task; 100static struct task_struct *stats_task;
147static struct task_struct *shuffler_task;
148static struct task_struct *stutter_task;
149static struct task_struct *fqs_task; 101static struct task_struct *fqs_task;
150static struct task_struct *boost_tasks[NR_CPUS]; 102static struct task_struct *boost_tasks[NR_CPUS];
151static struct task_struct *shutdown_task;
152#ifdef CONFIG_HOTPLUG_CPU
153static struct task_struct *onoff_task;
154#endif /* #ifdef CONFIG_HOTPLUG_CPU */
155static struct task_struct *stall_task; 103static struct task_struct *stall_task;
156static struct task_struct **barrier_cbs_tasks; 104static struct task_struct **barrier_cbs_tasks;
157static struct task_struct *barrier_task; 105static struct task_struct *barrier_task;
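
torture_param() comes from the new <linux/torture.h>. Judging by the module_param()/MODULE_PARM_DESC() triples it replaces above, it is roughly equivalent to the following (the 0444 permission is inferred from the deleted code; see the header for the authoritative definition):

	#define torture_param(type, name, init, msg) \
		static type name = init; \
		module_param(name, type, 0444); \
		MODULE_PARM_DESC(name, msg)
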
@@ -170,10 +118,10 @@ static struct rcu_torture __rcu *rcu_torture_current;
170static unsigned long rcu_torture_current_version; 118static unsigned long rcu_torture_current_version;
171static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 119static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
172static DEFINE_SPINLOCK(rcu_torture_lock); 120static DEFINE_SPINLOCK(rcu_torture_lock);
173static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 121static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
174 { 0 }; 122 rcu_torture_count) = { 0 };
175static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = 123static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
176 { 0 }; 124 rcu_torture_batch) = { 0 };
177static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; 125static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
178static atomic_t n_rcu_torture_alloc; 126static atomic_t n_rcu_torture_alloc;
179static atomic_t n_rcu_torture_alloc_fail; 127static atomic_t n_rcu_torture_alloc_fail;
@@ -186,22 +134,9 @@ static long n_rcu_torture_boost_rterror;
186static long n_rcu_torture_boost_failure; 134static long n_rcu_torture_boost_failure;
187static long n_rcu_torture_boosts; 135static long n_rcu_torture_boosts;
188static long n_rcu_torture_timers; 136static long n_rcu_torture_timers;
189static long n_offline_attempts;
190static long n_offline_successes;
191static unsigned long sum_offline;
192static int min_offline = -1;
193static int max_offline;
194static long n_online_attempts;
195static long n_online_successes;
196static unsigned long sum_online;
197static int min_online = -1;
198static int max_online;
199static long n_barrier_attempts; 137static long n_barrier_attempts;
200static long n_barrier_successes; 138static long n_barrier_successes;
201static struct list_head rcu_torture_removed; 139static struct list_head rcu_torture_removed;
202static cpumask_var_t shuffle_tmp_mask;
203
204static int stutter_pause_test;
205 140
206#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 141#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
207#define RCUTORTURE_RUNNABLE_INIT 1 142#define RCUTORTURE_RUNNABLE_INIT 1
@@ -232,7 +167,6 @@ static u64 notrace rcu_trace_clock_local(void)
232} 167}
233#endif /* #else #ifdef CONFIG_RCU_TRACE */ 168#endif /* #else #ifdef CONFIG_RCU_TRACE */
234 169
235static unsigned long shutdown_time; /* jiffies to system shutdown. */
236static unsigned long boost_starttime; /* jiffies of next boost test start. */ 170static unsigned long boost_starttime; /* jiffies of next boost test start. */
237DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
238 /* and boost task create/destroy. */ 172 /* and boost task create/destroy. */
@@ -242,51 +176,6 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
242static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ 176static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
243static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); 177static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
244 178
245/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
246
247#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
248#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
249#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
250static int fullstop = FULLSTOP_RMMOD;
251/*
252 * Protect fullstop transitions and spawning of kthreads.
253 */
254static DEFINE_MUTEX(fullstop_mutex);
255
256/* Forward reference. */
257static void rcu_torture_cleanup(void);
258
259/*
260 * Detect and respond to a system shutdown.
261 */
262static int
263rcutorture_shutdown_notify(struct notifier_block *unused1,
264 unsigned long unused2, void *unused3)
265{
266 mutex_lock(&fullstop_mutex);
267 if (fullstop == FULLSTOP_DONTSTOP)
268 fullstop = FULLSTOP_SHUTDOWN;
269 else
270 pr_warn(/* but going down anyway, so... */
271 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
272 mutex_unlock(&fullstop_mutex);
273 return NOTIFY_DONE;
274}
275
276/*
277 * Absorb kthreads into a kernel function that won't return, so that
278 * they won't ever access module text or data again.
279 */
280static void rcutorture_shutdown_absorb(const char *title)
281{
282 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
283 pr_notice(
284 "rcutorture thread %s parking due to system shutdown\n",
285 title);
286 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
287 }
288}
289
290/* 179/*
291 * Allocate an element from the rcu_tortures pool. 180 * Allocate an element from the rcu_tortures pool.
292 */ 181 */
@@ -320,44 +209,6 @@ rcu_torture_free(struct rcu_torture *p)
320 spin_unlock_bh(&rcu_torture_lock); 209 spin_unlock_bh(&rcu_torture_lock);
321} 210}
322 211
323struct rcu_random_state {
324 unsigned long rrs_state;
325 long rrs_count;
326};
327
328#define RCU_RANDOM_MULT 39916801 /* prime */
329#define RCU_RANDOM_ADD 479001701 /* prime */
330#define RCU_RANDOM_REFRESH 10000
331
332#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }
333
334/*
335 * Crude but fast random-number generator. Uses a linear congruential
336 * generator, with occasional help from cpu_clock().
337 */
338static unsigned long
339rcu_random(struct rcu_random_state *rrsp)
340{
341 if (--rrsp->rrs_count < 0) {
342 rrsp->rrs_state += (unsigned long)local_clock();
343 rrsp->rrs_count = RCU_RANDOM_REFRESH;
344 }
345 rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
346 return swahw32(rrsp->rrs_state);
347}
348
349static void
350rcu_stutter_wait(const char *title)
351{
352 while (stutter_pause_test || !rcutorture_runnable) {
353 if (rcutorture_runnable)
354 schedule_timeout_interruptible(1);
355 else
356 schedule_timeout_interruptible(round_jiffies_relative(HZ));
357 rcutorture_shutdown_absorb(title);
358 }
359}
360
361/* 212/*
362 * Operations vector for selecting different types of tests. 213 * Operations vector for selecting different types of tests.
363 */ 214 */
@@ -365,7 +216,7 @@ rcu_stutter_wait(const char *title)
365struct rcu_torture_ops { 216struct rcu_torture_ops {
366 void (*init)(void); 217 void (*init)(void);
367 int (*readlock)(void); 218 int (*readlock)(void);
368 void (*read_delay)(struct rcu_random_state *rrsp); 219 void (*read_delay)(struct torture_random_state *rrsp);
369 void (*readunlock)(int idx); 220 void (*readunlock)(int idx);
370 int (*completed)(void); 221 int (*completed)(void);
371 void (*deferred_free)(struct rcu_torture *p); 222 void (*deferred_free)(struct rcu_torture *p);
@@ -392,7 +243,7 @@ static int rcu_torture_read_lock(void) __acquires(RCU)
392 return 0; 243 return 0;
393} 244}
394 245
395static void rcu_read_delay(struct rcu_random_state *rrsp) 246static void rcu_read_delay(struct torture_random_state *rrsp)
396{ 247{
397 const unsigned long shortdelay_us = 200; 248 const unsigned long shortdelay_us = 200;
398 const unsigned long longdelay_ms = 50; 249 const unsigned long longdelay_ms = 50;
@@ -401,12 +252,13 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
401 * period, and we want a long delay occasionally to trigger 252 * period, and we want a long delay occasionally to trigger
402 * force_quiescent_state. */ 253 * force_quiescent_state. */
403 254
404 if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) 255 if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
405 mdelay(longdelay_ms); 256 mdelay(longdelay_ms);
406 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) 257 if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
407 udelay(shortdelay_us); 258 udelay(shortdelay_us);
408#ifdef CONFIG_PREEMPT 259#ifdef CONFIG_PREEMPT
409 if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) 260 if (!preempt_count() &&
261 !(torture_random(rrsp) % (nrealreaders * 20000)))
410 preempt_schedule(); /* No QS if preempt_disable() in effect */ 262 preempt_schedule(); /* No QS if preempt_disable() in effect */
411#endif 263#endif
412} 264}
@@ -427,7 +279,7 @@ rcu_torture_cb(struct rcu_head *p)
427 int i; 279 int i;
428 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); 280 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
429 281
430 if (fullstop != FULLSTOP_DONTSTOP) { 282 if (torture_must_stop_irq()) {
431 /* Test is ending, just drop callbacks on the floor. */ 283 /* Test is ending, just drop callbacks on the floor. */
432 /* The next initialization will pick up the pieces. */ 284 /* The next initialization will pick up the pieces. */
433 return; 285 return;
@@ -520,6 +372,48 @@ static struct rcu_torture_ops rcu_bh_ops = {
520}; 372};
521 373
522/* 374/*
375 * Don't even think about trying any of these in real life!!!
 376 * The names include "busted", and they really mean it!
377 * The only purpose of these functions is to provide a buggy RCU
378 * implementation to make sure that rcutorture correctly emits
379 * buggy-RCU error messages.
380 */
381static void rcu_busted_torture_deferred_free(struct rcu_torture *p)
382{
383 /* This is a deliberate bug for testing purposes only! */
384 rcu_torture_cb(&p->rtort_rcu);
385}
386
387static void synchronize_rcu_busted(void)
388{
389 /* This is a deliberate bug for testing purposes only! */
390}
391
392static void
393call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
394{
395 /* This is a deliberate bug for testing purposes only! */
396 func(head);
397}
398
399static struct rcu_torture_ops rcu_busted_ops = {
400 .init = rcu_sync_torture_init,
401 .readlock = rcu_torture_read_lock,
402 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
403 .readunlock = rcu_torture_read_unlock,
404 .completed = rcu_no_completed,
405 .deferred_free = rcu_busted_torture_deferred_free,
406 .sync = synchronize_rcu_busted,
407 .exp_sync = synchronize_rcu_busted,
408 .call = call_rcu_busted,
409 .cb_barrier = NULL,
410 .fqs = NULL,
411 .stats = NULL,
412 .irq_capable = 1,
413 .name = "rcu_busted"
414};
415
416/*
523 * Definitions for srcu torture testing. 417 * Definitions for srcu torture testing.
524 */ 418 */
525 419
@@ -530,7 +424,7 @@ static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
530 return srcu_read_lock(&srcu_ctl); 424 return srcu_read_lock(&srcu_ctl);
531} 425}
532 426
533static void srcu_read_delay(struct rcu_random_state *rrsp) 427static void srcu_read_delay(struct torture_random_state *rrsp)
534{ 428{
535 long delay; 429 long delay;
536 const long uspertick = 1000000 / HZ; 430 const long uspertick = 1000000 / HZ;
@@ -538,7 +432,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
538 432
539 /* We want there to be long-running readers, but not all the time. */ 433 /* We want there to be long-running readers, but not all the time. */
540 434
541 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); 435 delay = torture_random(rrsp) %
436 (nrealreaders * 2 * longdelay * uspertick);
542 if (!delay) 437 if (!delay)
543 schedule_timeout_interruptible(longdelay); 438 schedule_timeout_interruptible(longdelay);
544 else 439 else
@@ -677,12 +572,12 @@ static int rcu_torture_boost(void *arg)
677 struct rcu_boost_inflight rbi = { .inflight = 0 }; 572 struct rcu_boost_inflight rbi = { .inflight = 0 };
678 struct sched_param sp; 573 struct sched_param sp;
679 574
680 VERBOSE_PRINTK_STRING("rcu_torture_boost started"); 575 VERBOSE_TOROUT_STRING("rcu_torture_boost started");
681 576
682 /* Set real-time priority. */ 577 /* Set real-time priority. */
683 sp.sched_priority = 1; 578 sp.sched_priority = 1;
684 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { 579 if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
685 VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); 580 VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!");
686 n_rcu_torture_boost_rterror++; 581 n_rcu_torture_boost_rterror++;
687 } 582 }
688 583
@@ -693,9 +588,8 @@ static int rcu_torture_boost(void *arg)
693 oldstarttime = boost_starttime; 588 oldstarttime = boost_starttime;
694 while (ULONG_CMP_LT(jiffies, oldstarttime)) { 589 while (ULONG_CMP_LT(jiffies, oldstarttime)) {
695 schedule_timeout_interruptible(oldstarttime - jiffies); 590 schedule_timeout_interruptible(oldstarttime - jiffies);
696 rcu_stutter_wait("rcu_torture_boost"); 591 stutter_wait("rcu_torture_boost");
697 if (kthread_should_stop() || 592 if (torture_must_stop())
698 fullstop != FULLSTOP_DONTSTOP)
699 goto checkwait; 593 goto checkwait;
700 } 594 }
701 595
@@ -710,15 +604,14 @@ static int rcu_torture_boost(void *arg)
710 call_rcu(&rbi.rcu, rcu_torture_boost_cb); 604 call_rcu(&rbi.rcu, rcu_torture_boost_cb);
711 if (jiffies - call_rcu_time > 605 if (jiffies - call_rcu_time >
712 test_boost_duration * HZ - HZ / 2) { 606 test_boost_duration * HZ - HZ / 2) {
713 VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); 607 VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
714 n_rcu_torture_boost_failure++; 608 n_rcu_torture_boost_failure++;
715 } 609 }
716 call_rcu_time = jiffies; 610 call_rcu_time = jiffies;
717 } 611 }
718 cond_resched(); 612 cond_resched();
719 rcu_stutter_wait("rcu_torture_boost"); 613 stutter_wait("rcu_torture_boost");
720 if (kthread_should_stop() || 614 if (torture_must_stop())
721 fullstop != FULLSTOP_DONTSTOP)
722 goto checkwait; 615 goto checkwait;
723 } 616 }
724 617
@@ -742,16 +635,17 @@ static int rcu_torture_boost(void *arg)
742 } 635 }
743 636
744 /* Go do the stutter. */ 637 /* Go do the stutter. */
745checkwait: rcu_stutter_wait("rcu_torture_boost"); 638checkwait: stutter_wait("rcu_torture_boost");
746 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 639 } while (!torture_must_stop());
747 640
748 /* Clean up and exit. */ 641 /* Clean up and exit. */
749 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 642 while (!kthread_should_stop() || rbi.inflight) {
750 rcutorture_shutdown_absorb("rcu_torture_boost"); 643 torture_shutdown_absorb("rcu_torture_boost");
751 while (!kthread_should_stop() || rbi.inflight)
752 schedule_timeout_uninterruptible(1); 644 schedule_timeout_uninterruptible(1);
645 }
753 smp_mb(); /* order accesses to ->inflight before stack-frame death. */ 646 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
754 destroy_rcu_head_on_stack(&rbi.rcu); 647 destroy_rcu_head_on_stack(&rbi.rcu);
648 torture_kthread_stopping("rcu_torture_boost");
755 return 0; 649 return 0;
756} 650}
757 651
@@ -766,7 +660,7 @@ rcu_torture_fqs(void *arg)
766 unsigned long fqs_resume_time; 660 unsigned long fqs_resume_time;
767 int fqs_burst_remaining; 661 int fqs_burst_remaining;
768 662
769 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); 663 VERBOSE_TOROUT_STRING("rcu_torture_fqs task started");
770 do { 664 do {
771 fqs_resume_time = jiffies + fqs_stutter * HZ; 665 fqs_resume_time = jiffies + fqs_stutter * HZ;
772 while (ULONG_CMP_LT(jiffies, fqs_resume_time) && 666 while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
@@ -780,12 +674,9 @@ rcu_torture_fqs(void *arg)
780 udelay(fqs_holdoff); 674 udelay(fqs_holdoff);
781 fqs_burst_remaining -= fqs_holdoff; 675 fqs_burst_remaining -= fqs_holdoff;
782 } 676 }
783 rcu_stutter_wait("rcu_torture_fqs"); 677 stutter_wait("rcu_torture_fqs");
784 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 678 } while (!torture_must_stop());
785 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); 679 torture_kthread_stopping("rcu_torture_fqs");
786 rcutorture_shutdown_absorb("rcu_torture_fqs");
787 while (!kthread_should_stop())
788 schedule_timeout_uninterruptible(1);
789 return 0; 680 return 0;
790} 681}
791 682
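The conversions above and below all follow the same shape; a minimal sketch of the torture-framework kthread loop this patch standardizes on (the function and string names here are illustrative):

	static int example_torture_kthread(void *arg)
	{
		VERBOSE_TOROUT_STRING("example task started");
		do {
			/* ... one unit of torture work ... */
			stutter_wait("example");	/* pause while the stutter logic holds tasks off */
		} while (!torture_must_stop());		/* covers both rmmod and system shutdown */
		torture_kthread_stopping("example");	/* handshake with torture_stop_kthread() */
		return 0;
	}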
@@ -802,10 +693,10 @@ rcu_torture_writer(void *arg)
802 struct rcu_torture *rp; 693 struct rcu_torture *rp;
803 struct rcu_torture *rp1; 694 struct rcu_torture *rp1;
804 struct rcu_torture *old_rp; 695 struct rcu_torture *old_rp;
805 static DEFINE_RCU_RANDOM(rand); 696 static DEFINE_TORTURE_RANDOM(rand);
806 697
807 VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); 698 VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
808 set_user_nice(current, 19); 699 set_user_nice(current, MAX_NICE);
809 700
810 do { 701 do {
811 schedule_timeout_uninterruptible(1); 702 schedule_timeout_uninterruptible(1);
@@ -813,7 +704,7 @@ rcu_torture_writer(void *arg)
813 if (rp == NULL) 704 if (rp == NULL)
814 continue; 705 continue;
815 rp->rtort_pipe_count = 0; 706 rp->rtort_pipe_count = 0;
816 udelay(rcu_random(&rand) & 0x3ff); 707 udelay(torture_random(&rand) & 0x3ff);
817 old_rp = rcu_dereference_check(rcu_torture_current, 708 old_rp = rcu_dereference_check(rcu_torture_current,
818 current == writer_task); 709 current == writer_task);
819 rp->rtort_mbtest = 1; 710 rp->rtort_mbtest = 1;
@@ -826,7 +717,7 @@ rcu_torture_writer(void *arg)
826 atomic_inc(&rcu_torture_wcount[i]); 717 atomic_inc(&rcu_torture_wcount[i]);
827 old_rp->rtort_pipe_count++; 718 old_rp->rtort_pipe_count++;
828 if (gp_normal == gp_exp) 719 if (gp_normal == gp_exp)
829 exp = !!(rcu_random(&rand) & 0x80); 720 exp = !!(torture_random(&rand) & 0x80);
830 else 721 else
831 exp = gp_exp; 722 exp = gp_exp;
832 if (!exp) { 723 if (!exp) {
@@ -852,12 +743,9 @@ rcu_torture_writer(void *arg)
852 } 743 }
853 } 744 }
854 rcutorture_record_progress(++rcu_torture_current_version); 745 rcutorture_record_progress(++rcu_torture_current_version);
855 rcu_stutter_wait("rcu_torture_writer"); 746 stutter_wait("rcu_torture_writer");
856 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 747 } while (!torture_must_stop());
857 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 748 torture_kthread_stopping("rcu_torture_writer");
858 rcutorture_shutdown_absorb("rcu_torture_writer");
859 while (!kthread_should_stop())
860 schedule_timeout_uninterruptible(1);
861 return 0; 749 return 0;
862} 750}
863 751
@@ -868,19 +756,19 @@ rcu_torture_writer(void *arg)
868static int 756static int
869rcu_torture_fakewriter(void *arg) 757rcu_torture_fakewriter(void *arg)
870{ 758{
871 DEFINE_RCU_RANDOM(rand); 759 DEFINE_TORTURE_RANDOM(rand);
872 760
873 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); 761 VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
874 set_user_nice(current, 19); 762 set_user_nice(current, MAX_NICE);
875 763
876 do { 764 do {
877 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 765 schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
878 udelay(rcu_random(&rand) & 0x3ff); 766 udelay(torture_random(&rand) & 0x3ff);
879 if (cur_ops->cb_barrier != NULL && 767 if (cur_ops->cb_barrier != NULL &&
880 rcu_random(&rand) % (nfakewriters * 8) == 0) { 768 torture_random(&rand) % (nfakewriters * 8) == 0) {
881 cur_ops->cb_barrier(); 769 cur_ops->cb_barrier();
882 } else if (gp_normal == gp_exp) { 770 } else if (gp_normal == gp_exp) {
883 if (rcu_random(&rand) & 0x80) 771 if (torture_random(&rand) & 0x80)
884 cur_ops->sync(); 772 cur_ops->sync();
885 else 773 else
886 cur_ops->exp_sync(); 774 cur_ops->exp_sync();
@@ -889,13 +777,10 @@ rcu_torture_fakewriter(void *arg)
889 } else { 777 } else {
890 cur_ops->exp_sync(); 778 cur_ops->exp_sync();
891 } 779 }
892 rcu_stutter_wait("rcu_torture_fakewriter"); 780 stutter_wait("rcu_torture_fakewriter");
893 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 781 } while (!torture_must_stop());
894 782
895 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 783 torture_kthread_stopping("rcu_torture_fakewriter");
896 rcutorture_shutdown_absorb("rcu_torture_fakewriter");
897 while (!kthread_should_stop())
898 schedule_timeout_uninterruptible(1);
899 return 0; 784 return 0;
900} 785}
901 786
@@ -921,7 +806,7 @@ static void rcu_torture_timer(unsigned long unused)
921 int idx; 806 int idx;
922 int completed; 807 int completed;
923 int completed_end; 808 int completed_end;
924 static DEFINE_RCU_RANDOM(rand); 809 static DEFINE_TORTURE_RANDOM(rand);
925 static DEFINE_SPINLOCK(rand_lock); 810 static DEFINE_SPINLOCK(rand_lock);
926 struct rcu_torture *p; 811 struct rcu_torture *p;
927 int pipe_count; 812 int pipe_count;
@@ -980,14 +865,14 @@ rcu_torture_reader(void *arg)
980 int completed; 865 int completed;
981 int completed_end; 866 int completed_end;
982 int idx; 867 int idx;
983 DEFINE_RCU_RANDOM(rand); 868 DEFINE_TORTURE_RANDOM(rand);
984 struct rcu_torture *p; 869 struct rcu_torture *p;
985 int pipe_count; 870 int pipe_count;
986 struct timer_list t; 871 struct timer_list t;
987 unsigned long long ts; 872 unsigned long long ts;
988 873
989 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 874 VERBOSE_TOROUT_STRING("rcu_torture_reader task started");
990 set_user_nice(current, 19); 875 set_user_nice(current, MAX_NICE);
991 if (irqreader && cur_ops->irq_capable) 876 if (irqreader && cur_ops->irq_capable)
992 setup_timer_on_stack(&t, rcu_torture_timer, 0); 877 setup_timer_on_stack(&t, rcu_torture_timer, 0);
993 878
@@ -1034,14 +919,11 @@ rcu_torture_reader(void *arg)
1034 preempt_enable(); 919 preempt_enable();
1035 cur_ops->readunlock(idx); 920 cur_ops->readunlock(idx);
1036 schedule(); 921 schedule();
1037 rcu_stutter_wait("rcu_torture_reader"); 922 stutter_wait("rcu_torture_reader");
1038 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 923 } while (!torture_must_stop());
1039 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
1040 rcutorture_shutdown_absorb("rcu_torture_reader");
1041 if (irqreader && cur_ops->irq_capable) 924 if (irqreader && cur_ops->irq_capable)
1042 del_timer_sync(&t); 925 del_timer_sync(&t);
1043 while (!kthread_should_stop()) 926 torture_kthread_stopping("rcu_torture_reader");
1044 schedule_timeout_uninterruptible(1);
1045 return 0; 927 return 0;
1046} 928}
1047 929
@@ -1083,13 +965,7 @@ rcu_torture_printk(char *page)
1083 n_rcu_torture_boost_failure, 965 n_rcu_torture_boost_failure,
1084 n_rcu_torture_boosts, 966 n_rcu_torture_boosts,
1085 n_rcu_torture_timers); 967 n_rcu_torture_timers);
1086 page += sprintf(page, 968 page = torture_onoff_stats(page);
1087 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
1088 n_online_successes, n_online_attempts,
1089 n_offline_successes, n_offline_attempts,
1090 min_online, max_online,
1091 min_offline, max_offline,
1092 sum_online, sum_offline, HZ);
1093 page += sprintf(page, "barrier: %ld/%ld:%ld", 969 page += sprintf(page, "barrier: %ld/%ld:%ld",
1094 n_barrier_successes, 970 n_barrier_successes,
1095 n_barrier_attempts, 971 n_barrier_attempts,
@@ -1150,123 +1026,17 @@ rcu_torture_stats_print(void)
1150/* 1026/*
1151 * Periodically prints torture statistics, if periodic statistics printing 1027 * Periodically prints torture statistics, if periodic statistics printing
1152 * was specified via the stat_interval module parameter. 1028 * was specified via the stat_interval module parameter.
1153 *
1154 * No need to worry about fullstop here, since this one doesn't reference
1155 * volatile state or register callbacks.
1156 */ 1029 */
1157static int 1030static int
1158rcu_torture_stats(void *arg) 1031rcu_torture_stats(void *arg)
1159{ 1032{
1160 VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); 1033 VERBOSE_TOROUT_STRING("rcu_torture_stats task started");
1161 do { 1034 do {
1162 schedule_timeout_interruptible(stat_interval * HZ); 1035 schedule_timeout_interruptible(stat_interval * HZ);
1163 rcu_torture_stats_print(); 1036 rcu_torture_stats_print();
1164 rcutorture_shutdown_absorb("rcu_torture_stats"); 1037 torture_shutdown_absorb("rcu_torture_stats");
1165 } while (!kthread_should_stop()); 1038 } while (!torture_must_stop());
1166 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); 1039 torture_kthread_stopping("rcu_torture_stats");
1167 return 0;
1168}
1169
1170static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
1171
1172/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case
1173 * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs.
1174 */
1175static void rcu_torture_shuffle_tasks(void)
1176{
1177 int i;
1178
1179 cpumask_setall(shuffle_tmp_mask);
1180 get_online_cpus();
1181
1182 /* No point in shuffling if there is only one online CPU (ex: UP) */
1183 if (num_online_cpus() == 1) {
1184 put_online_cpus();
1185 return;
1186 }
1187
1188 if (rcu_idle_cpu != -1)
1189 cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
1190
1191 set_cpus_allowed_ptr(current, shuffle_tmp_mask);
1192
1193 if (reader_tasks) {
1194 for (i = 0; i < nrealreaders; i++)
1195 if (reader_tasks[i])
1196 set_cpus_allowed_ptr(reader_tasks[i],
1197 shuffle_tmp_mask);
1198 }
1199 if (fakewriter_tasks) {
1200 for (i = 0; i < nfakewriters; i++)
1201 if (fakewriter_tasks[i])
1202 set_cpus_allowed_ptr(fakewriter_tasks[i],
1203 shuffle_tmp_mask);
1204 }
1205 if (writer_task)
1206 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
1207 if (stats_task)
1208 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
1209 if (stutter_task)
1210 set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask);
1211 if (fqs_task)
1212 set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask);
1213 if (shutdown_task)
1214 set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask);
1215#ifdef CONFIG_HOTPLUG_CPU
1216 if (onoff_task)
1217 set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask);
1218#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1219 if (stall_task)
1220 set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask);
1221 if (barrier_cbs_tasks)
1222 for (i = 0; i < n_barrier_cbs; i++)
1223 if (barrier_cbs_tasks[i])
1224 set_cpus_allowed_ptr(barrier_cbs_tasks[i],
1225 shuffle_tmp_mask);
1226 if (barrier_task)
1227 set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask);
1228
1229 if (rcu_idle_cpu == -1)
1230 rcu_idle_cpu = num_online_cpus() - 1;
1231 else
1232 rcu_idle_cpu--;
1233
1234 put_online_cpus();
1235}
1236
1237/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
1238 * system to become idle at a time and cut off its timer ticks. This is meant
1239 * to test the support for such tickless idle CPU in RCU.
1240 */
1241static int
1242rcu_torture_shuffle(void *arg)
1243{
1244 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started");
1245 do {
1246 schedule_timeout_interruptible(shuffle_interval * HZ);
1247 rcu_torture_shuffle_tasks();
1248 rcutorture_shutdown_absorb("rcu_torture_shuffle");
1249 } while (!kthread_should_stop());
1250 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
1251 return 0;
1252}
1253
1254/* Cause the rcutorture test to "stutter", starting and stopping all
1255 * threads periodically.
1256 */
1257static int
1258rcu_torture_stutter(void *arg)
1259{
1260 VERBOSE_PRINTK_STRING("rcu_torture_stutter task started");
1261 do {
1262 schedule_timeout_interruptible(stutter * HZ);
1263 stutter_pause_test = 1;
1264 if (!kthread_should_stop())
1265 schedule_timeout_interruptible(stutter * HZ);
1266 stutter_pause_test = 0;
1267 rcutorture_shutdown_absorb("rcu_torture_stutter");
1268 } while (!kthread_should_stop());
1269 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
1270 return 0; 1040 return 0;
1271} 1041}
1272 1042
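The shuffle and stutter kthreads deleted here are not lost; their logic moves into the shared kernel/torture.c and is re-armed from rcu_torture_init() later in this patch, roughly:

	if (test_no_idle_hz) {
		firsterr = torture_shuffle_init(shuffle_interval * HZ);
		if (firsterr)
			goto unwind;
	}
	if (stutter) {
		firsterr = torture_stutter_init(stutter * HZ);
		if (firsterr)
			goto unwind;
	}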
@@ -1293,10 +1063,6 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
1293 onoff_interval, onoff_holdoff); 1063 onoff_interval, onoff_holdoff);
1294} 1064}
1295 1065
1296static struct notifier_block rcutorture_shutdown_nb = {
1297 .notifier_call = rcutorture_shutdown_notify,
1298};
1299
1300static void rcutorture_booster_cleanup(int cpu) 1066static void rcutorture_booster_cleanup(int cpu)
1301{ 1067{
1302 struct task_struct *t; 1068 struct task_struct *t;
@@ -1304,14 +1070,12 @@ static void rcutorture_booster_cleanup(int cpu)
1304 if (boost_tasks[cpu] == NULL) 1070 if (boost_tasks[cpu] == NULL)
1305 return; 1071 return;
1306 mutex_lock(&boost_mutex); 1072 mutex_lock(&boost_mutex);
1307 VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
1308 t = boost_tasks[cpu]; 1073 t = boost_tasks[cpu];
1309 boost_tasks[cpu] = NULL; 1074 boost_tasks[cpu] = NULL;
1310 mutex_unlock(&boost_mutex); 1075 mutex_unlock(&boost_mutex);
1311 1076
1312 /* This must be outside of the mutex, otherwise deadlock! */ 1077 /* This must be outside of the mutex, otherwise deadlock! */
1313 kthread_stop(t); 1078 torture_stop_kthread(rcu_torture_boost, t);
1314 boost_tasks[cpu] = NULL;
1315} 1079}
1316 1080
1317static int rcutorture_booster_init(int cpu) 1081static int rcutorture_booster_init(int cpu)
@@ -1323,13 +1087,13 @@ static int rcutorture_booster_init(int cpu)
1323 1087
1324 /* Don't allow time recalculation while creating a new task. */ 1088 /* Don't allow time recalculation while creating a new task. */
1325 mutex_lock(&boost_mutex); 1089 mutex_lock(&boost_mutex);
1326 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); 1090 VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task");
1327 boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, 1091 boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
1328 cpu_to_node(cpu), 1092 cpu_to_node(cpu),
1329 "rcu_torture_boost"); 1093 "rcu_torture_boost");
1330 if (IS_ERR(boost_tasks[cpu])) { 1094 if (IS_ERR(boost_tasks[cpu])) {
1331 retval = PTR_ERR(boost_tasks[cpu]); 1095 retval = PTR_ERR(boost_tasks[cpu]);
1332 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); 1096 VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed");
1333 n_rcu_torture_boost_ktrerror++; 1097 n_rcu_torture_boost_ktrerror++;
1334 boost_tasks[cpu] = NULL; 1098 boost_tasks[cpu] = NULL;
1335 mutex_unlock(&boost_mutex); 1099 mutex_unlock(&boost_mutex);
@@ -1342,175 +1106,6 @@ static int rcutorture_booster_init(int cpu)
1342} 1106}
1343 1107
1344/* 1108/*
1345 * Cause the rcutorture test to shutdown the system after the test has
1346 * run for the time specified by the shutdown_secs module parameter.
1347 */
1348static int
1349rcu_torture_shutdown(void *arg)
1350{
1351 long delta;
1352 unsigned long jiffies_snap;
1353
1354 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
1355 jiffies_snap = ACCESS_ONCE(jiffies);
1356 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
1357 !kthread_should_stop()) {
1358 delta = shutdown_time - jiffies_snap;
1359 if (verbose)
1360 pr_alert("%s" TORTURE_FLAG
1361 "rcu_torture_shutdown task: %lu jiffies remaining\n",
1362 torture_type, delta);
1363 schedule_timeout_interruptible(delta);
1364 jiffies_snap = ACCESS_ONCE(jiffies);
1365 }
1366 if (kthread_should_stop()) {
1367 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
1368 return 0;
1369 }
1370
1371 /* OK, shut down the system. */
1372
1373 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
1374 shutdown_task = NULL; /* Avoid self-kill deadlock. */
1375 rcu_torture_cleanup(); /* Get the success/failure message. */
1376 kernel_power_off(); /* Shut down the system. */
1377 return 0;
1378}
1379
1380#ifdef CONFIG_HOTPLUG_CPU
1381
1382/*
1383 * Execute random CPU-hotplug operations at the interval specified
1384 * by the onoff_interval.
1385 */
1386static int
1387rcu_torture_onoff(void *arg)
1388{
1389 int cpu;
1390 unsigned long delta;
1391 int maxcpu = -1;
1392 DEFINE_RCU_RANDOM(rand);
1393 int ret;
1394 unsigned long starttime;
1395
1396 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1397 for_each_online_cpu(cpu)
1398 maxcpu = cpu;
1399 WARN_ON(maxcpu < 0);
1400 if (onoff_holdoff > 0) {
1401 VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff");
1402 schedule_timeout_interruptible(onoff_holdoff * HZ);
1403 VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff");
1404 }
1405 while (!kthread_should_stop()) {
1406 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1407 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1408 if (verbose)
1409 pr_alert("%s" TORTURE_FLAG
1410 "rcu_torture_onoff task: offlining %d\n",
1411 torture_type, cpu);
1412 starttime = jiffies;
1413 n_offline_attempts++;
1414 ret = cpu_down(cpu);
1415 if (ret) {
1416 if (verbose)
1417 pr_alert("%s" TORTURE_FLAG
1418 "rcu_torture_onoff task: offline %d failed: errno %d\n",
1419 torture_type, cpu, ret);
1420 } else {
1421 if (verbose)
1422 pr_alert("%s" TORTURE_FLAG
1423 "rcu_torture_onoff task: offlined %d\n",
1424 torture_type, cpu);
1425 n_offline_successes++;
1426 delta = jiffies - starttime;
1427 sum_offline += delta;
1428 if (min_offline < 0) {
1429 min_offline = delta;
1430 max_offline = delta;
1431 }
1432 if (min_offline > delta)
1433 min_offline = delta;
1434 if (max_offline < delta)
1435 max_offline = delta;
1436 }
1437 } else if (cpu_is_hotpluggable(cpu)) {
1438 if (verbose)
1439 pr_alert("%s" TORTURE_FLAG
1440 "rcu_torture_onoff task: onlining %d\n",
1441 torture_type, cpu);
1442 starttime = jiffies;
1443 n_online_attempts++;
1444 ret = cpu_up(cpu);
1445 if (ret) {
1446 if (verbose)
1447 pr_alert("%s" TORTURE_FLAG
1448 "rcu_torture_onoff task: online %d failed: errno %d\n",
1449 torture_type, cpu, ret);
1450 } else {
1451 if (verbose)
1452 pr_alert("%s" TORTURE_FLAG
1453 "rcu_torture_onoff task: onlined %d\n",
1454 torture_type, cpu);
1455 n_online_successes++;
1456 delta = jiffies - starttime;
1457 sum_online += delta;
1458 if (min_online < 0) {
1459 min_online = delta;
1460 max_online = delta;
1461 }
1462 if (min_online > delta)
1463 min_online = delta;
1464 if (max_online < delta)
1465 max_online = delta;
1466 }
1467 }
1468 schedule_timeout_interruptible(onoff_interval * HZ);
1469 }
1470 VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
1471 return 0;
1472}
1473
1474static int
1475rcu_torture_onoff_init(void)
1476{
1477 int ret;
1478
1479 if (onoff_interval <= 0)
1480 return 0;
1481 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1482 if (IS_ERR(onoff_task)) {
1483 ret = PTR_ERR(onoff_task);
1484 onoff_task = NULL;
1485 return ret;
1486 }
1487 return 0;
1488}
1489
1490static void rcu_torture_onoff_cleanup(void)
1491{
1492 if (onoff_task == NULL)
1493 return;
1494 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1495 kthread_stop(onoff_task);
1496 onoff_task = NULL;
1497}
1498
1499#else /* #ifdef CONFIG_HOTPLUG_CPU */
1500
1501static int
1502rcu_torture_onoff_init(void)
1503{
1504 return 0;
1505}
1506
1507static void rcu_torture_onoff_cleanup(void)
1508{
1509}
1510
1511#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1512
1513/*
1514 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then 1109 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
1515 * induces a CPU stall for the time specified by stall_cpu. 1110 * induces a CPU stall for the time specified by stall_cpu.
1516 */ 1111 */
@@ -1518,11 +1113,11 @@ static int rcu_torture_stall(void *args)
1518{ 1113{
1519 unsigned long stop_at; 1114 unsigned long stop_at;
1520 1115
1521 VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); 1116 VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
1522 if (stall_cpu_holdoff > 0) { 1117 if (stall_cpu_holdoff > 0) {
1523 VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); 1118 VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
1524 schedule_timeout_interruptible(stall_cpu_holdoff * HZ); 1119 schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
1525 VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); 1120 VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff");
1526 } 1121 }
1527 if (!kthread_should_stop()) { 1122 if (!kthread_should_stop()) {
1528 stop_at = get_seconds() + stall_cpu; 1123 stop_at = get_seconds() + stall_cpu;
@@ -1536,7 +1131,7 @@ static int rcu_torture_stall(void *args)
1536 rcu_read_unlock(); 1131 rcu_read_unlock();
1537 pr_alert("rcu_torture_stall end.\n"); 1132 pr_alert("rcu_torture_stall end.\n");
1538 } 1133 }
1539 rcutorture_shutdown_absorb("rcu_torture_stall"); 1134 torture_shutdown_absorb("rcu_torture_stall");
1540 while (!kthread_should_stop()) 1135 while (!kthread_should_stop())
1541 schedule_timeout_interruptible(10 * HZ); 1136 schedule_timeout_interruptible(10 * HZ);
1542 return 0; 1137 return 0;
@@ -1545,27 +1140,9 @@ static int rcu_torture_stall(void *args)
1545/* Spawn CPU-stall kthread, if stall_cpu specified. */ 1140/* Spawn CPU-stall kthread, if stall_cpu specified. */
1546static int __init rcu_torture_stall_init(void) 1141static int __init rcu_torture_stall_init(void)
1547{ 1142{
1548 int ret;
1549
1550 if (stall_cpu <= 0) 1143 if (stall_cpu <= 0)
1551 return 0; 1144 return 0;
1552 stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); 1145 return torture_create_kthread(rcu_torture_stall, NULL, stall_task);
1553 if (IS_ERR(stall_task)) {
1554 ret = PTR_ERR(stall_task);
1555 stall_task = NULL;
1556 return ret;
1557 }
1558 return 0;
1559}
1560
1561/* Clean up after the CPU-stall kthread, if one was spawned. */
1562static void rcu_torture_stall_cleanup(void)
1563{
1564 if (stall_task == NULL)
1565 return;
1566 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
1567 kthread_stop(stall_task);
1568 stall_task = NULL;
1569} 1146}
1570 1147
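A hedged sketch of the contract rcu_torture_stall_init() now relies on (internals are in kernel/torture.c): torture_create_kthread() fills in the task pointer and returns 0 on success, or the -errno from kthread creation on failure, which is why the open-coded error and cleanup paths above can go away:

	int ret;

	ret = torture_create_kthread(rcu_torture_stall, NULL, stall_task);
	if (ret)
		return ret;	/* stall_task is presumably left NULL on failure */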
1571/* Callback function for RCU barrier testing. */ 1148/* Callback function for RCU barrier testing. */
@@ -1583,28 +1160,24 @@ static int rcu_torture_barrier_cbs(void *arg)
1583 struct rcu_head rcu; 1160 struct rcu_head rcu;
1584 1161
1585 init_rcu_head_on_stack(&rcu); 1162 init_rcu_head_on_stack(&rcu);
1586 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); 1163 VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started");
1587 set_user_nice(current, 19); 1164 set_user_nice(current, MAX_NICE);
1588 do { 1165 do {
1589 wait_event(barrier_cbs_wq[myid], 1166 wait_event(barrier_cbs_wq[myid],
1590 (newphase = 1167 (newphase =
1591 ACCESS_ONCE(barrier_phase)) != lastphase || 1168 ACCESS_ONCE(barrier_phase)) != lastphase ||
1592 kthread_should_stop() || 1169 torture_must_stop());
1593 fullstop != FULLSTOP_DONTSTOP);
1594 lastphase = newphase; 1170 lastphase = newphase;
1595 smp_mb(); /* ensure barrier_phase load before ->call(). */ 1171 smp_mb(); /* ensure barrier_phase load before ->call(). */
1596 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) 1172 if (torture_must_stop())
1597 break; 1173 break;
1598 cur_ops->call(&rcu, rcu_torture_barrier_cbf); 1174 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
1599 if (atomic_dec_and_test(&barrier_cbs_count)) 1175 if (atomic_dec_and_test(&barrier_cbs_count))
1600 wake_up(&barrier_wq); 1176 wake_up(&barrier_wq);
1601 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1177 } while (!torture_must_stop());
1602 VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
1603 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
1604 while (!kthread_should_stop())
1605 schedule_timeout_interruptible(1);
1606 cur_ops->cb_barrier(); 1178 cur_ops->cb_barrier();
1607 destroy_rcu_head_on_stack(&rcu); 1179 destroy_rcu_head_on_stack(&rcu);
1180 torture_kthread_stopping("rcu_torture_barrier_cbs");
1608 return 0; 1181 return 0;
1609} 1182}
1610 1183
@@ -1613,7 +1186,7 @@ static int rcu_torture_barrier(void *arg)
1613{ 1186{
1614 int i; 1187 int i;
1615 1188
1616 VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); 1189 VERBOSE_TOROUT_STRING("rcu_torture_barrier task starting");
1617 do { 1190 do {
1618 atomic_set(&barrier_cbs_invoked, 0); 1191 atomic_set(&barrier_cbs_invoked, 0);
1619 atomic_set(&barrier_cbs_count, n_barrier_cbs); 1192 atomic_set(&barrier_cbs_count, n_barrier_cbs);
@@ -1623,9 +1196,8 @@ static int rcu_torture_barrier(void *arg)
1623 wake_up(&barrier_cbs_wq[i]); 1196 wake_up(&barrier_cbs_wq[i]);
1624 wait_event(barrier_wq, 1197 wait_event(barrier_wq,
1625 atomic_read(&barrier_cbs_count) == 0 || 1198 atomic_read(&barrier_cbs_count) == 0 ||
1626 kthread_should_stop() || 1199 torture_must_stop());
1627 fullstop != FULLSTOP_DONTSTOP); 1200 if (torture_must_stop())
1628 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1629 break; 1201 break;
1630 n_barrier_attempts++; 1202 n_barrier_attempts++;
1631 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ 1203 cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
@@ -1635,11 +1207,8 @@ static int rcu_torture_barrier(void *arg)
1635 } 1207 }
1636 n_barrier_successes++; 1208 n_barrier_successes++;
1637 schedule_timeout_interruptible(HZ / 10); 1209 schedule_timeout_interruptible(HZ / 10);
1638 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1210 } while (!torture_must_stop());
1639 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); 1211 torture_kthread_stopping("rcu_torture_barrier");
1640 rcutorture_shutdown_absorb("rcu_torture_barrier");
1641 while (!kthread_should_stop())
1642 schedule_timeout_interruptible(1);
1643 return 0; 1212 return 0;
1644} 1213}
1645 1214
@@ -1672,24 +1241,13 @@ static int rcu_torture_barrier_init(void)
1672 return -ENOMEM; 1241 return -ENOMEM;
1673 for (i = 0; i < n_barrier_cbs; i++) { 1242 for (i = 0; i < n_barrier_cbs; i++) {
1674 init_waitqueue_head(&barrier_cbs_wq[i]); 1243 init_waitqueue_head(&barrier_cbs_wq[i]);
1675 barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, 1244 ret = torture_create_kthread(rcu_torture_barrier_cbs,
1676 (void *)(long)i, 1245 (void *)(long)i,
1677 "rcu_torture_barrier_cbs"); 1246 barrier_cbs_tasks[i]);
1678 if (IS_ERR(barrier_cbs_tasks[i])) { 1247 if (ret)
1679 ret = PTR_ERR(barrier_cbs_tasks[i]);
1680 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
1681 barrier_cbs_tasks[i] = NULL;
1682 return ret; 1248 return ret;
1683 }
1684 } 1249 }
1685 barrier_task = kthread_run(rcu_torture_barrier, NULL, 1250 return torture_create_kthread(rcu_torture_barrier, NULL, barrier_task);
1686 "rcu_torture_barrier");
1687 if (IS_ERR(barrier_task)) {
1688 ret = PTR_ERR(barrier_task);
1689 VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
1690 barrier_task = NULL;
1691 }
1692 return 0;
1693} 1251}
1694 1252
1695/* Clean up after RCU barrier testing. */ 1253/* Clean up after RCU barrier testing. */
@@ -1697,19 +1255,11 @@ static void rcu_torture_barrier_cleanup(void)
1697{ 1255{
1698 int i; 1256 int i;
1699 1257
1700 if (barrier_task != NULL) { 1258 torture_stop_kthread(rcu_torture_barrier, barrier_task);
1701 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
1702 kthread_stop(barrier_task);
1703 barrier_task = NULL;
1704 }
1705 if (barrier_cbs_tasks != NULL) { 1259 if (barrier_cbs_tasks != NULL) {
1706 for (i = 0; i < n_barrier_cbs; i++) { 1260 for (i = 0; i < n_barrier_cbs; i++)
1707 if (barrier_cbs_tasks[i] != NULL) { 1261 torture_stop_kthread(rcu_torture_barrier_cbs,
1708 VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); 1262 barrier_cbs_tasks[i]);
1709 kthread_stop(barrier_cbs_tasks[i]);
1710 barrier_cbs_tasks[i] = NULL;
1711 }
1712 }
1713 kfree(barrier_cbs_tasks); 1263 kfree(barrier_cbs_tasks);
1714 barrier_cbs_tasks = NULL; 1264 barrier_cbs_tasks = NULL;
1715 } 1265 }
@@ -1747,90 +1297,42 @@ rcu_torture_cleanup(void)
1747{ 1297{
1748 int i; 1298 int i;
1749 1299
1750 mutex_lock(&fullstop_mutex);
1751 rcutorture_record_test_transition(); 1300 rcutorture_record_test_transition();
1752 if (fullstop == FULLSTOP_SHUTDOWN) { 1301 if (torture_cleanup()) {
1753 pr_warn(/* but going down anyway, so... */
1754 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
1755 mutex_unlock(&fullstop_mutex);
1756 schedule_timeout_uninterruptible(10);
1757 if (cur_ops->cb_barrier != NULL) 1302 if (cur_ops->cb_barrier != NULL)
1758 cur_ops->cb_barrier(); 1303 cur_ops->cb_barrier();
1759 return; 1304 return;
1760 } 1305 }
1761 fullstop = FULLSTOP_RMMOD;
1762 mutex_unlock(&fullstop_mutex);
1763 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1764 rcu_torture_barrier_cleanup();
1765 rcu_torture_stall_cleanup();
1766 if (stutter_task) {
1767 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1768 kthread_stop(stutter_task);
1769 }
1770 stutter_task = NULL;
1771 if (shuffler_task) {
1772 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
1773 kthread_stop(shuffler_task);
1774 free_cpumask_var(shuffle_tmp_mask);
1775 }
1776 shuffler_task = NULL;
1777 1306
1778 if (writer_task) { 1307 rcu_torture_barrier_cleanup();
1779 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); 1308 torture_stop_kthread(rcu_torture_stall, stall_task);
1780 kthread_stop(writer_task); 1309 torture_stop_kthread(rcu_torture_writer, writer_task);
1781 }
1782 writer_task = NULL;
1783 1310
1784 if (reader_tasks) { 1311 if (reader_tasks) {
1785 for (i = 0; i < nrealreaders; i++) { 1312 for (i = 0; i < nrealreaders; i++)
1786 if (reader_tasks[i]) { 1313 torture_stop_kthread(rcu_torture_reader,
1787 VERBOSE_PRINTK_STRING( 1314 reader_tasks[i]);
1788 "Stopping rcu_torture_reader task");
1789 kthread_stop(reader_tasks[i]);
1790 }
1791 reader_tasks[i] = NULL;
1792 }
1793 kfree(reader_tasks); 1315 kfree(reader_tasks);
1794 reader_tasks = NULL;
1795 } 1316 }
1796 rcu_torture_current = NULL; 1317 rcu_torture_current = NULL;
1797 1318
1798 if (fakewriter_tasks) { 1319 if (fakewriter_tasks) {
1799 for (i = 0; i < nfakewriters; i++) { 1320 for (i = 0; i < nfakewriters; i++) {
1800 if (fakewriter_tasks[i]) { 1321 torture_stop_kthread(rcu_torture_fakewriter,
1801 VERBOSE_PRINTK_STRING( 1322 fakewriter_tasks[i]);
1802 "Stopping rcu_torture_fakewriter task");
1803 kthread_stop(fakewriter_tasks[i]);
1804 }
1805 fakewriter_tasks[i] = NULL;
1806 } 1323 }
1807 kfree(fakewriter_tasks); 1324 kfree(fakewriter_tasks);
1808 fakewriter_tasks = NULL; 1325 fakewriter_tasks = NULL;
1809 } 1326 }
1810 1327
1811 if (stats_task) { 1328 torture_stop_kthread(rcu_torture_stats, stats_task);
1812 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); 1329 torture_stop_kthread(rcu_torture_fqs, fqs_task);
1813 kthread_stop(stats_task);
1814 }
1815 stats_task = NULL;
1816
1817 if (fqs_task) {
1818 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
1819 kthread_stop(fqs_task);
1820 }
1821 fqs_task = NULL;
1822 if ((test_boost == 1 && cur_ops->can_boost) || 1330 if ((test_boost == 1 && cur_ops->can_boost) ||
1823 test_boost == 2) { 1331 test_boost == 2) {
1824 unregister_cpu_notifier(&rcutorture_cpu_nb); 1332 unregister_cpu_notifier(&rcutorture_cpu_nb);
1825 for_each_possible_cpu(i) 1333 for_each_possible_cpu(i)
1826 rcutorture_booster_cleanup(i); 1334 rcutorture_booster_cleanup(i);
1827 } 1335 }
1828 if (shutdown_task != NULL) {
1829 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1830 kthread_stop(shutdown_task);
1831 }
1832 shutdown_task = NULL;
1833 rcu_torture_onoff_cleanup();
1834 1336
1835 /* Wait for all RCU callbacks to fire. */ 1337 /* Wait for all RCU callbacks to fire. */
1836 1338
@@ -1841,8 +1343,7 @@ rcu_torture_cleanup(void)
1841 1343
1842 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) 1344 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1843 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1345 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1844 else if (n_online_successes != n_online_attempts || 1346 else if (torture_onoff_failures())
1845 n_offline_successes != n_offline_attempts)
1846 rcu_torture_print_module_parms(cur_ops, 1347 rcu_torture_print_module_parms(cur_ops,
1847 "End of test: RCU_HOTPLUG"); 1348 "End of test: RCU_HOTPLUG");
1848 else 1349 else
@@ -1911,12 +1412,11 @@ rcu_torture_init(void)
1911 int i; 1412 int i;
1912 int cpu; 1413 int cpu;
1913 int firsterr = 0; 1414 int firsterr = 0;
1914 int retval;
1915 static struct rcu_torture_ops *torture_ops[] = { 1415 static struct rcu_torture_ops *torture_ops[] = {
1916 &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, 1416 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
1917 }; 1417 };
1918 1418
1919 mutex_lock(&fullstop_mutex); 1419 torture_init_begin(torture_type, verbose, &rcutorture_runnable);
1920 1420
1921 /* Process args and tell the world that the torturer is on the job. */ 1421 /* Process args and tell the world that the torturer is on the job. */
1922 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1422 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
@@ -1931,7 +1431,7 @@ rcu_torture_init(void)
1931 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) 1431 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1932 pr_alert(" %s", torture_ops[i]->name); 1432 pr_alert(" %s", torture_ops[i]->name);
1933 pr_alert("\n"); 1433 pr_alert("\n");
1934 mutex_unlock(&fullstop_mutex); 1434 torture_init_end();
1935 return -EINVAL; 1435 return -EINVAL;
1936 } 1436 }
1937 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1437 if (cur_ops->fqs == NULL && fqs_duration != 0) {
@@ -1946,7 +1446,6 @@ rcu_torture_init(void)
1946 else 1446 else
1947 nrealreaders = 2 * num_online_cpus(); 1447 nrealreaders = 2 * num_online_cpus();
1948 rcu_torture_print_module_parms(cur_ops, "Start of test"); 1448 rcu_torture_print_module_parms(cur_ops, "Start of test");
1949 fullstop = FULLSTOP_DONTSTOP;
1950 1449
1951 /* Set up the freelist. */ 1450 /* Set up the freelist. */
1952 1451
@@ -1982,108 +1481,61 @@ rcu_torture_init(void)
1982 1481
1983 /* Start up the kthreads. */ 1482 /* Start up the kthreads. */
1984 1483
1985 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); 1484 firsterr = torture_create_kthread(rcu_torture_writer, NULL,
1986 writer_task = kthread_create(rcu_torture_writer, NULL, 1485 writer_task);
1987 "rcu_torture_writer"); 1486 if (firsterr)
1988 if (IS_ERR(writer_task)) {
1989 firsterr = PTR_ERR(writer_task);
1990 VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
1991 writer_task = NULL;
1992 goto unwind; 1487 goto unwind;
1993 }
1994 wake_up_process(writer_task);
1995 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 1488 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
1996 GFP_KERNEL); 1489 GFP_KERNEL);
1997 if (fakewriter_tasks == NULL) { 1490 if (fakewriter_tasks == NULL) {
1998 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1491 VERBOSE_TOROUT_ERRSTRING("out of memory");
1999 firsterr = -ENOMEM; 1492 firsterr = -ENOMEM;
2000 goto unwind; 1493 goto unwind;
2001 } 1494 }
2002 for (i = 0; i < nfakewriters; i++) { 1495 for (i = 0; i < nfakewriters; i++) {
2003 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); 1496 firsterr = torture_create_kthread(rcu_torture_fakewriter,
2004 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, 1497 NULL, fakewriter_tasks[i]);
2005 "rcu_torture_fakewriter"); 1498 if (firsterr)
2006 if (IS_ERR(fakewriter_tasks[i])) {
2007 firsterr = PTR_ERR(fakewriter_tasks[i]);
2008 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
2009 fakewriter_tasks[i] = NULL;
2010 goto unwind; 1499 goto unwind;
2011 }
2012 } 1500 }
2013 reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), 1501 reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
2014 GFP_KERNEL); 1502 GFP_KERNEL);
2015 if (reader_tasks == NULL) { 1503 if (reader_tasks == NULL) {
2016 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1504 VERBOSE_TOROUT_ERRSTRING("out of memory");
2017 firsterr = -ENOMEM; 1505 firsterr = -ENOMEM;
2018 goto unwind; 1506 goto unwind;
2019 } 1507 }
2020 for (i = 0; i < nrealreaders; i++) { 1508 for (i = 0; i < nrealreaders; i++) {
2021 VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); 1509 firsterr = torture_create_kthread(rcu_torture_reader, NULL,
2022 reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, 1510 reader_tasks[i]);
2023 "rcu_torture_reader"); 1511 if (firsterr)
2024 if (IS_ERR(reader_tasks[i])) {
2025 firsterr = PTR_ERR(reader_tasks[i]);
2026 VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
2027 reader_tasks[i] = NULL;
2028 goto unwind; 1512 goto unwind;
2029 }
2030 } 1513 }
2031 if (stat_interval > 0) { 1514 if (stat_interval > 0) {
2032 VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); 1515 firsterr = torture_create_kthread(rcu_torture_stats, NULL,
2033 stats_task = kthread_run(rcu_torture_stats, NULL, 1516 stats_task);
2034 "rcu_torture_stats"); 1517 if (firsterr)
2035 if (IS_ERR(stats_task)) {
2036 firsterr = PTR_ERR(stats_task);
2037 VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
2038 stats_task = NULL;
2039 goto unwind; 1518 goto unwind;
2040 }
2041 } 1519 }
2042 if (test_no_idle_hz) { 1520 if (test_no_idle_hz) {
2043 rcu_idle_cpu = num_online_cpus() - 1; 1521 firsterr = torture_shuffle_init(shuffle_interval * HZ);
2044 1522 if (firsterr)
2045 if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
2046 firsterr = -ENOMEM;
2047 VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
2048 goto unwind;
2049 }
2050
2051 /* Create the shuffler thread */
2052 shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
2053 "rcu_torture_shuffle");
2054 if (IS_ERR(shuffler_task)) {
2055 free_cpumask_var(shuffle_tmp_mask);
2056 firsterr = PTR_ERR(shuffler_task);
2057 VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
2058 shuffler_task = NULL;
2059 goto unwind; 1523 goto unwind;
2060 }
2061 } 1524 }
2062 if (stutter < 0) 1525 if (stutter < 0)
2063 stutter = 0; 1526 stutter = 0;
2064 if (stutter) { 1527 if (stutter) {
2065 /* Create the stutter thread */ 1528 firsterr = torture_stutter_init(stutter * HZ);
2066 stutter_task = kthread_run(rcu_torture_stutter, NULL, 1529 if (firsterr)
2067 "rcu_torture_stutter");
2068 if (IS_ERR(stutter_task)) {
2069 firsterr = PTR_ERR(stutter_task);
2070 VERBOSE_PRINTK_ERRSTRING("Failed to create stutter");
2071 stutter_task = NULL;
2072 goto unwind; 1530 goto unwind;
2073 }
2074 } 1531 }
2075 if (fqs_duration < 0) 1532 if (fqs_duration < 0)
2076 fqs_duration = 0; 1533 fqs_duration = 0;
2077 if (fqs_duration) { 1534 if (fqs_duration) {
2078 /* Create the stutter thread */ 1535 /* Create the fqs thread */
2079 fqs_task = kthread_run(rcu_torture_fqs, NULL, 1536 firsterr = torture_create_kthread(rcu_torture_fqs, NULL, fqs_task);
2080 "rcu_torture_fqs"); 1537 if (firsterr)
2081 if (IS_ERR(fqs_task)) {
2082 firsterr = PTR_ERR(fqs_task);
2083 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
2084 fqs_task = NULL;
2085 goto unwind; 1538 goto unwind;
2086 }
2087 } 1539 }
2088 if (test_boost_interval < 1) 1540 if (test_boost_interval < 1)
2089 test_boost_interval = 1; 1541 test_boost_interval = 1;
@@ -2097,49 +1549,31 @@ rcu_torture_init(void)
2097 for_each_possible_cpu(i) { 1549 for_each_possible_cpu(i) {
2098 if (cpu_is_offline(i)) 1550 if (cpu_is_offline(i))
2099 continue; /* Heuristic: CPU can go offline. */ 1551 continue; /* Heuristic: CPU can go offline. */
2100 retval = rcutorture_booster_init(i); 1552 firsterr = rcutorture_booster_init(i);
2101 if (retval < 0) { 1553 if (firsterr)
2102 firsterr = retval;
2103 goto unwind; 1554 goto unwind;
2104 }
2105 } 1555 }
2106 } 1556 }
2107 if (shutdown_secs > 0) { 1557 firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
2108 shutdown_time = jiffies + shutdown_secs * HZ; 1558 if (firsterr)
2109 shutdown_task = kthread_create(rcu_torture_shutdown, NULL,
2110 "rcu_torture_shutdown");
2111 if (IS_ERR(shutdown_task)) {
2112 firsterr = PTR_ERR(shutdown_task);
2113 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
2114 shutdown_task = NULL;
2115 goto unwind;
2116 }
2117 wake_up_process(shutdown_task);
2118 }
2119 i = rcu_torture_onoff_init();
2120 if (i != 0) {
2121 firsterr = i;
2122 goto unwind; 1559 goto unwind;
2123 } 1560 firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ);
2124 register_reboot_notifier(&rcutorture_shutdown_nb); 1561 if (firsterr)
2125 i = rcu_torture_stall_init();
2126 if (i != 0) {
2127 firsterr = i;
2128 goto unwind; 1562 goto unwind;
2129 } 1563 firsterr = rcu_torture_stall_init();
2130 retval = rcu_torture_barrier_init(); 1564 if (firsterr)
2131 if (retval != 0) { 1565 goto unwind;
2132 firsterr = retval; 1566 firsterr = rcu_torture_barrier_init();
1567 if (firsterr)
2133 goto unwind; 1568 goto unwind;
2134 }
2135 if (object_debug) 1569 if (object_debug)
2136 rcu_test_debug_objects(); 1570 rcu_test_debug_objects();
2137 rcutorture_record_test_transition(); 1571 rcutorture_record_test_transition();
2138 mutex_unlock(&fullstop_mutex); 1572 torture_init_end();
2139 return 0; 1573 return 0;
2140 1574
2141unwind: 1575unwind:
2142 mutex_unlock(&fullstop_mutex); 1576 torture_init_end();
2143 rcu_torture_cleanup(); 1577 rcu_torture_cleanup();
2144 return firsterr; 1578 return firsterr;
2145} 1579}
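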
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 3318d8284384..c639556f3fa0 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2006 18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012 19 * Copyright (C) Fujitsu, 2012
@@ -36,8 +36,6 @@
36#include <linux/delay.h> 36#include <linux/delay.h>
37#include <linux/srcu.h> 37#include <linux/srcu.h>
38 38
39#include <trace/events/rcu.h>
40
41#include "rcu.h" 39#include "rcu.h"
42 40
43/* 41/*
@@ -398,7 +396,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
398 rcu_batch_queue(&sp->batch_queue, head); 396 rcu_batch_queue(&sp->batch_queue, head);
399 if (!sp->running) { 397 if (!sp->running) {
400 sp->running = true; 398 sp->running = true;
401 schedule_delayed_work(&sp->work, 0); 399 queue_delayed_work(system_power_efficient_wq, &sp->work, 0);
402 } 400 }
403 spin_unlock_irqrestore(&sp->queue_lock, flags); 401 spin_unlock_irqrestore(&sp->queue_lock, flags);
404} 402}
@@ -674,7 +672,8 @@ static void srcu_reschedule(struct srcu_struct *sp)
674 } 672 }
675 673
676 if (pending) 674 if (pending)
677 schedule_delayed_work(&sp->work, SRCU_INTERVAL); 675 queue_delayed_work(system_power_efficient_wq,
676 &sp->work, SRCU_INTERVAL);
678} 677}
679 678
680/* 679/*
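For context on the workqueue change above: schedule_delayed_work() is, modulo its inline wrapper, queue_delayed_work() on the default system_wq, so the patch only redirects SRCU's deferred work onto the power-efficient workqueue:

	/* old: */	schedule_delayed_work(&sp->work, SRCU_INTERVAL);
	/* i.e.: */	queue_delayed_work(system_wq, &sp->work, SRCU_INTERVAL);
	/* new: */	queue_delayed_work(system_power_efficient_wq, &sp->work, SRCU_INTERVAL);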
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 1254f312d024..d9efcc13008c 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2008 18 * Copyright IBM Corporation, 2008
19 * 19 *
@@ -37,10 +37,6 @@
37#include <linux/prefetch.h> 37#include <linux/prefetch.h>
38#include <linux/ftrace_event.h> 38#include <linux/ftrace_event.h>
39 39
40#ifdef CONFIG_RCU_TRACE
41#include <trace/events/rcu.h>
42#endif /* #else #ifdef CONFIG_RCU_TRACE */
43
44#include "rcu.h" 40#include "rcu.h"
45 41
46/* Forward declarations for tiny_plugin.h. */ 42/* Forward declarations for tiny_plugin.h. */
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 280d06cae352..431528520562 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -14,8 +14,8 @@
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, you can access it online at
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 * http://www.gnu.org/licenses/gpl-2.0.html.
19 * 19 *
20 * Copyright (c) 2010 Linaro 20 * Copyright (c) 2010 Linaro
21 * 21 *
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b3d116cd072d..0c47e300210a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2008 18 * Copyright IBM Corporation, 2008
19 * 19 *
@@ -58,8 +58,6 @@
58#include <linux/suspend.h> 58#include <linux/suspend.h>
59 59
60#include "tree.h" 60#include "tree.h"
61#include <trace/events/rcu.h>
62
63#include "rcu.h" 61#include "rcu.h"
64 62
65MODULE_ALIAS("rcutree"); 63MODULE_ALIAS("rcutree");
@@ -837,7 +835,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
837 * to the next. Only do this for the primary flavor of RCU. 835 * to the next. Only do this for the primary flavor of RCU.
838 */ 836 */
839 if (rdp->rsp == rcu_state && 837 if (rdp->rsp == rcu_state &&
840 ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) { 838 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
841 rdp->rsp->jiffies_resched += 5; 839 rdp->rsp->jiffies_resched += 5;
842 resched_cpu(rdp->cpu); 840 resched_cpu(rdp->cpu);
843 } 841 }
@@ -847,7 +845,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
847 845
848static void record_gp_stall_check_time(struct rcu_state *rsp) 846static void record_gp_stall_check_time(struct rcu_state *rsp)
849{ 847{
850 unsigned long j = ACCESS_ONCE(jiffies); 848 unsigned long j = jiffies;
851 unsigned long j1; 849 unsigned long j1;
852 850
853 rsp->gp_start = j; 851 rsp->gp_start = j;
@@ -1005,7 +1003,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
1005 1003
1006 if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) 1004 if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
1007 return; 1005 return;
1008 j = ACCESS_ONCE(jiffies); 1006 j = jiffies;
1009 1007
1010 /* 1008 /*
1011 * Lots of memory barriers to reject false positives. 1009 * Lots of memory barriers to reject false positives.
@@ -1423,13 +1421,14 @@ static int rcu_gp_init(struct rcu_state *rsp)
1423 1421
1424 /* Advance to a new grace period and initialize state. */ 1422 /* Advance to a new grace period and initialize state. */
1425 record_gp_stall_check_time(rsp); 1423 record_gp_stall_check_time(rsp);
1426 smp_wmb(); /* Record GP times before starting GP. */ 1424 /* Record GP times before starting GP, hence smp_store_release(). */
1427 rsp->gpnum++; 1425 smp_store_release(&rsp->gpnum, rsp->gpnum + 1);
1428 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); 1426 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1429 raw_spin_unlock_irq(&rnp->lock); 1427 raw_spin_unlock_irq(&rnp->lock);
1430 1428
1431 /* Exclude any concurrent CPU-hotplug operations. */ 1429 /* Exclude any concurrent CPU-hotplug operations. */
1432 mutex_lock(&rsp->onoff_mutex); 1430 mutex_lock(&rsp->onoff_mutex);
1431 smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */
1433 1432
1434 /* 1433 /*
1435 * Set the quiescent-state-needed bits in all the rcu_node 1434 * Set the quiescent-state-needed bits in all the rcu_node
@@ -1557,10 +1556,11 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1557 } 1556 }
1558 rnp = rcu_get_root(rsp); 1557 rnp = rcu_get_root(rsp);
1559 raw_spin_lock_irq(&rnp->lock); 1558 raw_spin_lock_irq(&rnp->lock);
1560 smp_mb__after_unlock_lock(); 1559 smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
1561 rcu_nocb_gp_set(rnp, nocb); 1560 rcu_nocb_gp_set(rnp, nocb);
1562 1561
1563 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1562 /* Declare grace period done. */
1563 ACCESS_ONCE(rsp->completed) = rsp->gpnum;
1564 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); 1564 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
1565 rsp->fqs_state = RCU_GP_IDLE; 1565 rsp->fqs_state = RCU_GP_IDLE;
1566 rdp = this_cpu_ptr(rsp->rda); 1566 rdp = this_cpu_ptr(rsp->rda);
@@ -2304,7 +2304,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2304 if (rnp_old != NULL) 2304 if (rnp_old != NULL)
2305 raw_spin_unlock(&rnp_old->fqslock); 2305 raw_spin_unlock(&rnp_old->fqslock);
2306 if (ret) { 2306 if (ret) {
2307 rsp->n_force_qs_lh++; 2307 ACCESS_ONCE(rsp->n_force_qs_lh)++;
2308 return; 2308 return;
2309 } 2309 }
2310 rnp_old = rnp; 2310 rnp_old = rnp;
@@ -2316,7 +2316,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2316 smp_mb__after_unlock_lock(); 2316 smp_mb__after_unlock_lock();
2317 raw_spin_unlock(&rnp_old->fqslock); 2317 raw_spin_unlock(&rnp_old->fqslock);
2318 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2318 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
2319 rsp->n_force_qs_lh++; 2319 ACCESS_ONCE(rsp->n_force_qs_lh)++;
2320 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2320 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2321 return; /* Someone beat us to it. */ 2321 return; /* Someone beat us to it. */
2322 } 2322 }
@@ -2639,6 +2639,58 @@ void synchronize_rcu_bh(void)
2639} 2639}
2640EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 2640EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
2641 2641
2642/**
2643 * get_state_synchronize_rcu - Snapshot current RCU state
2644 *
2645 * Returns a cookie that is used by a later call to cond_synchronize_rcu()
2646 * to determine whether or not a full grace period has elapsed in the
2647 * meantime.
2648 */
2649unsigned long get_state_synchronize_rcu(void)
2650{
2651 /*
2652 * Any prior manipulation of RCU-protected data must happen
2653 * before the load from ->gpnum.
2654 */
2655 smp_mb(); /* ^^^ */
2656
2657 /*
2658 * Make sure this load happens before the purportedly
2659 * time-consuming work between get_state_synchronize_rcu()
2660 * and cond_synchronize_rcu().
2661 */
2662 return smp_load_acquire(&rcu_state->gpnum);
2663}
2664EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
2665
2666/**
2667 * cond_synchronize_rcu - Conditionally wait for an RCU grace period
2668 *
2669 * @oldstate: return value from earlier call to get_state_synchronize_rcu()
2670 *
2671 * If a full RCU grace period has elapsed since the earlier call to
2672 * get_state_synchronize_rcu(), just return. Otherwise, invoke
2673 * synchronize_rcu() to wait for a full grace period.
2674 *
2675 * Yes, this function does not take counter wrap into account. But
2676 * counter wrap is harmless. If the counter wraps, we have waited for
2677 * more than 2 billion grace periods (and way more on a 64-bit system!),
2678 * so waiting for one additional grace period should be just fine.
2679 */
2680void cond_synchronize_rcu(unsigned long oldstate)
2681{
2682 unsigned long newstate;
2683
2684 /*
2685 * Ensure that this load happens before any RCU-destructive
2686 * actions the caller might carry out after we return.
2687 */
2688 newstate = smp_load_acquire(&rcu_state->completed);
2689 if (ULONG_CMP_GE(oldstate, newstate))
2690 synchronize_rcu();
2691}
2692EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
2693
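A minimal caller-side sketch of the API pair added above (variable names are illustrative):

	unsigned long gp_cookie;

	gp_cookie = get_state_synchronize_rcu();
	/* ... potentially long-running work that needs no grace period ... */
	cond_synchronize_rcu(gp_cookie);	/* blocks only if no full GP elapsed since the snapshot */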
2642static int synchronize_sched_expedited_cpu_stop(void *data) 2694static int synchronize_sched_expedited_cpu_stop(void *data)
2643{ 2695{
2644 /* 2696 /*
@@ -2880,7 +2932,7 @@ static int rcu_pending(int cpu)
2880 * non-NULL, store an indication of whether all callbacks are lazy. 2932 * non-NULL, store an indication of whether all callbacks are lazy.
2881 * (If there are no callbacks, all of them are deemed to be lazy.) 2933 * (If there are no callbacks, all of them are deemed to be lazy.)
2882 */ 2934 */
2883static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) 2935static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2884{ 2936{
2885 bool al = true; 2937 bool al = true;
2886 bool hc = false; 2938 bool hc = false;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8c19873f1ac9..75dc3c39a02a 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -13,8 +13,8 @@
13 * GNU General Public License for more details. 13 * GNU General Public License for more details.
14 * 14 *
15 * You should have received a copy of the GNU General Public License 15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software 16 * along with this program; if not, you can access it online at
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 17 * http://www.gnu.org/licenses/gpl-2.0.html.
18 * 18 *
19 * Copyright IBM Corporation, 2008 19 * Copyright IBM Corporation, 2008
20 * 20 *
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 6e2ef4b2b920..962d1d589929 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -14,8 +14,8 @@
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, you can access it online at
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 * http://www.gnu.org/licenses/gpl-2.0.html.
19 * 19 *
20 * Copyright Red Hat, 2009 20 * Copyright Red Hat, 2009
21 * Copyright IBM Corporation, 2009 21 * Copyright IBM Corporation, 2009
@@ -1586,11 +1586,13 @@ static void rcu_prepare_kthreads(int cpu)
1586 * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs 1586 * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
1587 * any flavor of RCU. 1587 * any flavor of RCU.
1588 */ 1588 */
1589#ifndef CONFIG_RCU_NOCB_CPU_ALL
1589int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1590int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1590{ 1591{
1591 *delta_jiffies = ULONG_MAX; 1592 *delta_jiffies = ULONG_MAX;
1592 return rcu_cpu_has_callbacks(cpu, NULL); 1593 return rcu_cpu_has_callbacks(cpu, NULL);
1593} 1594}
1595#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1594 1596
1595/* 1597/*
1596 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up 1598 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@ -1656,7 +1658,7 @@ extern int tick_nohz_active;
1656 * only if it has been awhile since the last time we did so. Afterwards, 1658 * only if it has been awhile since the last time we did so. Afterwards,
1657 * if there are any callbacks ready for immediate invocation, return true. 1659 * if there are any callbacks ready for immediate invocation, return true.
1658 */ 1660 */
1659static bool rcu_try_advance_all_cbs(void) 1661static bool __maybe_unused rcu_try_advance_all_cbs(void)
1660{ 1662{
1661 bool cbs_ready = false; 1663 bool cbs_ready = false;
1662 struct rcu_data *rdp; 1664 struct rcu_data *rdp;
@@ -1696,6 +1698,7 @@ static bool rcu_try_advance_all_cbs(void)
1696 * 1698 *
1697 * The caller must have disabled interrupts. 1699 * The caller must have disabled interrupts.
1698 */ 1700 */
1701#ifndef CONFIG_RCU_NOCB_CPU_ALL
1699int rcu_needs_cpu(int cpu, unsigned long *dj) 1702int rcu_needs_cpu(int cpu, unsigned long *dj)
1700{ 1703{
1701 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1704 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
@@ -1726,6 +1729,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
1726 } 1729 }
1727 return 0; 1730 return 0;
1728} 1731}
1732#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1729 1733
1730/* 1734/*
1731 * Prepare a CPU for idle from an RCU perspective. The first major task 1735 * Prepare a CPU for idle from an RCU perspective. The first major task
@@ -1739,6 +1743,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
1739 */ 1743 */
1740static void rcu_prepare_for_idle(int cpu) 1744static void rcu_prepare_for_idle(int cpu)
1741{ 1745{
1746#ifndef CONFIG_RCU_NOCB_CPU_ALL
1742 struct rcu_data *rdp; 1747 struct rcu_data *rdp;
1743 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1748 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1744 struct rcu_node *rnp; 1749 struct rcu_node *rnp;
@@ -1790,6 +1795,7 @@ static void rcu_prepare_for_idle(int cpu)
1790 rcu_accelerate_cbs(rsp, rnp, rdp); 1795 rcu_accelerate_cbs(rsp, rnp, rdp);
1791 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1796 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1792 } 1797 }
1798#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1793} 1799}
1794 1800
1795/* 1801/*
@@ -1799,11 +1805,12 @@ static void rcu_prepare_for_idle(int cpu)
1799 */ 1805 */
1800static void rcu_cleanup_after_idle(int cpu) 1806static void rcu_cleanup_after_idle(int cpu)
1801{ 1807{
1802 1808#ifndef CONFIG_RCU_NOCB_CPU_ALL
1803 if (rcu_is_nocb_cpu(cpu)) 1809 if (rcu_is_nocb_cpu(cpu))
1804 return; 1810 return;
1805 if (rcu_try_advance_all_cbs()) 1811 if (rcu_try_advance_all_cbs())
1806 invoke_rcu_core(); 1812 invoke_rcu_core();
1813#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1807} 1814}
1808 1815
1809/* 1816/*
@@ -2101,6 +2108,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2101 init_waitqueue_head(&rnp->nocb_gp_wq[1]); 2108 init_waitqueue_head(&rnp->nocb_gp_wq[1]);
2102} 2109}
2103 2110
2111#ifndef CONFIG_RCU_NOCB_CPU_ALL
2104/* Is the specified CPU a no-CBs CPU? */ 2112/* Is the specified CPU a no-CBs CPU? */
2105bool rcu_is_nocb_cpu(int cpu) 2113bool rcu_is_nocb_cpu(int cpu)
2106{ 2114{
@@ -2108,6 +2116,7 @@ bool rcu_is_nocb_cpu(int cpu)
2108 return cpumask_test_cpu(cpu, rcu_nocb_mask); 2116 return cpumask_test_cpu(cpu, rcu_nocb_mask);
2109 return false; 2117 return false;
2110} 2118}
2119#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
2111 2120
2112/* 2121/*
2113 * Enqueue the specified string of rcu_head structures onto the specified 2122 * Enqueue the specified string of rcu_head structures onto the specified
@@ -2893,7 +2902,7 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2893 * CPU unless the grace period has extended for too long. 2902 * CPU unless the grace period has extended for too long.
2894 * 2903 *
2895 * This code relies on the fact that all NO_HZ_FULL CPUs are also 2904 * This code relies on the fact that all NO_HZ_FULL CPUs are also
2896 * CONFIG_RCU_NOCB_CPUs. 2905 * CONFIG_RCU_NOCB_CPU CPUs.
2897 */ 2906 */
2898static bool rcu_nohz_full_cpu(struct rcu_state *rsp) 2907static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
2899{ 2908{
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 4def475336d4..5cdc62e1beeb 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2008 18 * Copyright IBM Corporation, 2008
19 * 19 *
@@ -273,7 +273,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
273 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 273 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
274 rsp->n_force_qs, rsp->n_force_qs_ngp, 274 rsp->n_force_qs, rsp->n_force_qs_ngp,
275 rsp->n_force_qs - rsp->n_force_qs_ngp, 275 rsp->n_force_qs - rsp->n_force_qs_ngp,
276 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); 276 ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
277 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { 277 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
278 if (rnp->level != level) { 278 if (rnp->level != level) {
279 seq_puts(m, "\n"); 279 seq_puts(m, "\n");
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c54609faf233..4c0a9b0af469 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -12,8 +12,8 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, you can access it online at
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 * 17 *
18 * Copyright IBM Corporation, 2001 18 * Copyright IBM Corporation, 2001
19 * 19 *
@@ -49,7 +49,6 @@
49#include <linux/module.h> 49#include <linux/module.h>
50 50
51#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
52#include <trace/events/rcu.h>
53 52
54#include "rcu.h" 53#include "rcu.h"
55 54
diff --git a/kernel/relay.c b/kernel/relay.c
index 5001c9887db1..52d6a6f56261 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -227,7 +227,7 @@ static void relay_destroy_buf(struct rchan_buf *buf)
227 * relay_remove_buf - remove a channel buffer 227 * relay_remove_buf - remove a channel buffer
228 * @kref: target kernel reference that contains the relay buffer 228 * @kref: target kernel reference that contains the relay buffer
229 * 229 *
230 * Removes the file from the fileystem, which also frees the 230 * Removes the file from the filesystem, which also frees the
231 * rchan_buf_struct and the channel buffer. Should only be called from 231 * rchan_buf_struct and the channel buffer. Should only be called from
232 * kref_put(). 232 * kref_put().
233 */ 233 */
diff --git a/kernel/resource.c b/kernel/resource.c
index 3f285dce9347..8957d686e29b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -432,11 +432,6 @@ static void resource_clip(struct resource *res, resource_size_t min,
432 res->end = max; 432 res->end = max;
433} 433}
434 434
435static bool resource_contains(struct resource *res1, struct resource *res2)
436{
437 return res1->start <= res2->start && res1->end >= res2->end;
438}
439
440/* 435/*
441 * Find empty slot in the resource tree with the given range and 436 * Find empty slot in the resource tree with the given range and
442 * alignment constraints 437 * alignment constraints
@@ -471,10 +466,11 @@ static int __find_resource(struct resource *root, struct resource *old,
471 arch_remove_reservations(&tmp); 466 arch_remove_reservations(&tmp);
472 467
473 /* Check for overflow after ALIGN() */ 468 /* Check for overflow after ALIGN() */
474 avail = *new;
475 avail.start = ALIGN(tmp.start, constraint->align); 469 avail.start = ALIGN(tmp.start, constraint->align);
476 avail.end = tmp.end; 470 avail.end = tmp.end;
471 avail.flags = new->flags & ~IORESOURCE_UNSET;
477 if (avail.start >= tmp.start) { 472 if (avail.start >= tmp.start) {
473 alloc.flags = avail.flags;
478 alloc.start = constraint->alignf(constraint->alignf_data, &avail, 474 alloc.start = constraint->alignf(constraint->alignf_data, &avail,
479 size, constraint->align); 475 size, constraint->align);
480 alloc.end = alloc.start + size - 1; 476 alloc.end = alloc.start + size - 1;
@@ -515,7 +511,7 @@ static int find_resource(struct resource *root, struct resource *new,
515 * @newsize: new size of the resource descriptor 511 * @newsize: new size of the resource descriptor
516 * @constraint: the size and alignment constraints to be met. 512 * @constraint: the size and alignment constraints to be met.
517 */ 513 */
518int reallocate_resource(struct resource *root, struct resource *old, 514static int reallocate_resource(struct resource *root, struct resource *old,
519 resource_size_t newsize, 515 resource_size_t newsize,
520 struct resource_constraint *constraint) 516 struct resource_constraint *constraint)
521{ 517{
@@ -949,8 +945,8 @@ struct resource * __request_region(struct resource *parent,
949 res->name = name; 945 res->name = name;
950 res->start = start; 946 res->start = start;
951 res->end = start + n - 1; 947 res->end = start + n - 1;
952 res->flags = IORESOURCE_BUSY; 948 res->flags = resource_type(parent);
953 res->flags |= flags; 949 res->flags |= IORESOURCE_BUSY | flags;
954 950
955 write_lock(&resource_lock); 951 write_lock(&resource_lock);
956 952
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a95c8c2af2a..ab32b7b0db5c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
13 13
14obj-y += core.o proc.o clock.o cputime.o 14obj-y += core.o proc.o clock.o cputime.o
15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o 15obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
16obj-y += wait.o completion.o 16obj-y += wait.o completion.o idle.o
17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o 17obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 18obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
19obj-$(CONFIG_SCHEDSTATS) += stats.o 19obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 4a073539c58e..e73efba98301 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -203,7 +203,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
203 struct autogroup *ag; 203 struct autogroup *ag;
204 int err; 204 int err;
205 205
206 if (nice < -20 || nice > 19) 206 if (nice < MIN_NICE || nice > MAX_NICE)
207 return -EINVAL; 207 return -EINVAL;
208 208
209 err = security_task_setnice(current, nice); 209 err = security_task_setnice(current, nice);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f5c6635b806c..0ff3f34bc7e3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -432,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay)
432 if (rq == this_rq()) { 432 if (rq == this_rq()) {
433 __hrtick_restart(rq); 433 __hrtick_restart(rq);
434 } else if (!rq->hrtick_csd_pending) { 434 } else if (!rq->hrtick_csd_pending) {
435 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); 435 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
436 rq->hrtick_csd_pending = 1; 436 rq->hrtick_csd_pending = 1;
437 } 437 }
438} 438}
@@ -555,12 +555,15 @@ void resched_cpu(int cpu)
555 * selecting an idle cpu will add more delays to the timers than intended 555 * selecting an idle cpu will add more delays to the timers than intended
556 * (as that cpu's timer base may not be uptodate wrt jiffies etc). 556 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
557 */ 557 */
558int get_nohz_timer_target(void) 558int get_nohz_timer_target(int pinned)
559{ 559{
560 int cpu = smp_processor_id(); 560 int cpu = smp_processor_id();
561 int i; 561 int i;
562 struct sched_domain *sd; 562 struct sched_domain *sd;
563 563
564 if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
565 return cpu;
566
564 rcu_read_lock(); 567 rcu_read_lock();
565 for_each_domain(cpu, sd) { 568 for_each_domain(cpu, sd) {
566 for_each_cpu(i, sched_domain_span(sd)) { 569 for_each_cpu(i, sched_domain_span(sd)) {
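A hypothetical call site for the new signature, shown only to make the fast path visible; the timer and hrtimer callers are updated elsewhere in this series:

	/* Pinned timers, disabled timer migration, or a non-idle local CPU
	 * all short-circuit to the current CPU; only otherwise is an idle
	 * CPU searched for in the sched domains. */
	int target = get_nohz_timer_target(pinned);
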
@@ -823,19 +826,13 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
823#endif 826#endif
824#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 827#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
825 if (static_key_false((&paravirt_steal_rq_enabled))) { 828 if (static_key_false((&paravirt_steal_rq_enabled))) {
826 u64 st;
827
828 steal = paravirt_steal_clock(cpu_of(rq)); 829 steal = paravirt_steal_clock(cpu_of(rq));
829 steal -= rq->prev_steal_time_rq; 830 steal -= rq->prev_steal_time_rq;
830 831
831 if (unlikely(steal > delta)) 832 if (unlikely(steal > delta))
832 steal = delta; 833 steal = delta;
833 834
834 st = steal_ticks(steal);
835 steal = st * TICK_NSEC;
836
837 rq->prev_steal_time_rq += steal; 835 rq->prev_steal_time_rq += steal;
838
839 delta -= steal; 836 delta -= steal;
840 } 837 }
841#endif 838#endif
@@ -1745,8 +1742,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1745 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1742 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1746 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1743 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1747 p->numa_work.next = &p->numa_work; 1744 p->numa_work.next = &p->numa_work;
1748 p->numa_faults = NULL; 1745 p->numa_faults_memory = NULL;
1749 p->numa_faults_buffer = NULL; 1746 p->numa_faults_buffer_memory = NULL;
1747 p->last_task_numa_placement = 0;
1748 p->last_sum_exec_runtime = 0;
1750 1749
1751 INIT_LIST_HEAD(&p->numa_entry); 1750 INIT_LIST_HEAD(&p->numa_entry);
1752 p->numa_group = NULL; 1751 p->numa_group = NULL;
@@ -2149,8 +2148,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2149 if (mm) 2148 if (mm)
2150 mmdrop(mm); 2149 mmdrop(mm);
2151 if (unlikely(prev_state == TASK_DEAD)) { 2150 if (unlikely(prev_state == TASK_DEAD)) {
2152 task_numa_free(prev);
2153
2154 if (prev->sched_class->task_dead) 2151 if (prev->sched_class->task_dead)
2155 prev->sched_class->task_dead(prev); 2152 prev->sched_class->task_dead(prev);
2156 2153
@@ -2167,13 +2164,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2167 2164
2168#ifdef CONFIG_SMP 2165#ifdef CONFIG_SMP
2169 2166
2170/* assumes rq->lock is held */
2171static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2172{
2173 if (prev->sched_class->pre_schedule)
2174 prev->sched_class->pre_schedule(rq, prev);
2175}
2176
2177/* rq->lock is NOT held, but preemption is disabled */ 2167/* rq->lock is NOT held, but preemption is disabled */
2178static inline void post_schedule(struct rq *rq) 2168static inline void post_schedule(struct rq *rq)
2179{ 2169{
@@ -2191,10 +2181,6 @@ static inline void post_schedule(struct rq *rq)
2191 2181
2192#else 2182#else
2193 2183
2194static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2195{
2196}
2197
2198static inline void post_schedule(struct rq *rq) 2184static inline void post_schedule(struct rq *rq)
2199{ 2185{
2200} 2186}
@@ -2510,8 +2496,13 @@ void __kprobes preempt_count_add(int val)
2510 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2496 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2511 PREEMPT_MASK - 10); 2497 PREEMPT_MASK - 10);
2512#endif 2498#endif
2513 if (preempt_count() == val) 2499 if (preempt_count() == val) {
2514 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2500 unsigned long ip = get_parent_ip(CALLER_ADDR1);
2501#ifdef CONFIG_DEBUG_PREEMPT
2502 current->preempt_disable_ip = ip;
2503#endif
2504 trace_preempt_off(CALLER_ADDR0, ip);
2505 }
2515} 2506}
2516EXPORT_SYMBOL(preempt_count_add); 2507EXPORT_SYMBOL(preempt_count_add);
2517 2508
@@ -2554,6 +2545,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
2554 print_modules(); 2545 print_modules();
2555 if (irqs_disabled()) 2546 if (irqs_disabled())
2556 print_irqtrace_events(prev); 2547 print_irqtrace_events(prev);
2548#ifdef CONFIG_DEBUG_PREEMPT
2549 if (in_atomic_preempt_off()) {
2550 pr_err("Preemption disabled at:");
2551 print_ip_sym(current->preempt_disable_ip);
2552 pr_cont("\n");
2553 }
2554#endif
2557 dump_stack(); 2555 dump_stack();
2558 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 2556 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2559} 2557}
@@ -2577,36 +2575,34 @@ static inline void schedule_debug(struct task_struct *prev)
2577 schedstat_inc(this_rq(), sched_count); 2575 schedstat_inc(this_rq(), sched_count);
2578} 2576}
2579 2577
2580static void put_prev_task(struct rq *rq, struct task_struct *prev)
2581{
2582 if (prev->on_rq || rq->skip_clock_update < 0)
2583 update_rq_clock(rq);
2584 prev->sched_class->put_prev_task(rq, prev);
2585}
2586
2587/* 2578/*
2588 * Pick up the highest-prio task: 2579 * Pick up the highest-prio task:
2589 */ 2580 */
2590static inline struct task_struct * 2581static inline struct task_struct *
2591pick_next_task(struct rq *rq) 2582pick_next_task(struct rq *rq, struct task_struct *prev)
2592{ 2583{
2593 const struct sched_class *class; 2584 const struct sched_class *class = &fair_sched_class;
2594 struct task_struct *p; 2585 struct task_struct *p;
2595 2586
2596 /* 2587 /*
2597 * Optimization: we know that if all tasks are in 2588 * Optimization: we know that if all tasks are in
2598 * the fair class we can call that function directly: 2589 * the fair class we can call that function directly:
2599 */ 2590 */
2600 if (likely(rq->nr_running == rq->cfs.h_nr_running)) { 2591 if (likely(prev->sched_class == class &&
2601 p = fair_sched_class.pick_next_task(rq); 2592 rq->nr_running == rq->cfs.h_nr_running)) {
2602 if (likely(p)) 2593 p = fair_sched_class.pick_next_task(rq, prev);
2594 if (likely(p && p != RETRY_TASK))
2603 return p; 2595 return p;
2604 } 2596 }
2605 2597
2598again:
2606 for_each_class(class) { 2599 for_each_class(class) {
2607 p = class->pick_next_task(rq); 2600 p = class->pick_next_task(rq, prev);
2608 if (p) 2601 if (p) {
2602 if (unlikely(p == RETRY_TASK))
2603 goto again;
2609 return p; 2604 return p;
2605 }
2610 } 2606 }
2611 2607
2612 BUG(); /* the idle class will always have a runnable task */ 2608 BUG(); /* the idle class will always have a runnable task */
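As I read the reworked loop above, each class's ->pick_next_task(rq, prev) now also takes over put_prev_task(), and may return the RETRY_TASK sentinel after it had to drop rq->lock (for instance to pull tasks), telling the core to restart from the highest class. A schematic class-side implementation under that reading; every example_* helper is made up for illustration:

	static struct task_struct *
	pick_next_task_example(struct rq *rq, struct task_struct *prev)
	{
		if (example_need_pull(rq, prev)) {
			example_pull_tasks(rq);		/* may drop rq->lock */
			if (example_higher_class_queued(rq))
				return RETRY_TASK;	/* core loop starts over */
		}

		if (!example_nr_running(rq))
			return NULL;			/* defer to the next class */

		put_prev_task(rq, prev);		/* now the class's duty */
		return example_pick_highest(rq);
	}
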
@@ -2700,13 +2696,10 @@ need_resched:
2700 switch_count = &prev->nvcsw; 2696 switch_count = &prev->nvcsw;
2701 } 2697 }
2702 2698
2703 pre_schedule(rq, prev); 2699 if (prev->on_rq || rq->skip_clock_update < 0)
2704 2700 update_rq_clock(rq);
2705 if (unlikely(!rq->nr_running))
2706 idle_balance(cpu, rq);
2707 2701
2708 put_prev_task(rq, prev); 2702 next = pick_next_task(rq, prev);
2709 next = pick_next_task(rq);
2710 clear_tsk_need_resched(prev); 2703 clear_tsk_need_resched(prev);
2711 clear_preempt_need_resched(); 2704 clear_preempt_need_resched();
2712 rq->skip_clock_update = 0; 2705 rq->skip_clock_update = 0;
@@ -2852,52 +2845,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2852} 2845}
2853EXPORT_SYMBOL(default_wake_function); 2846EXPORT_SYMBOL(default_wake_function);
2854 2847
2855static long __sched
2856sleep_on_common(wait_queue_head_t *q, int state, long timeout)
2857{
2858 unsigned long flags;
2859 wait_queue_t wait;
2860
2861 init_waitqueue_entry(&wait, current);
2862
2863 __set_current_state(state);
2864
2865 spin_lock_irqsave(&q->lock, flags);
2866 __add_wait_queue(q, &wait);
2867 spin_unlock(&q->lock);
2868 timeout = schedule_timeout(timeout);
2869 spin_lock_irq(&q->lock);
2870 __remove_wait_queue(q, &wait);
2871 spin_unlock_irqrestore(&q->lock, flags);
2872
2873 return timeout;
2874}
2875
2876void __sched interruptible_sleep_on(wait_queue_head_t *q)
2877{
2878 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
2879}
2880EXPORT_SYMBOL(interruptible_sleep_on);
2881
2882long __sched
2883interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
2884{
2885 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
2886}
2887EXPORT_SYMBOL(interruptible_sleep_on_timeout);
2888
2889void __sched sleep_on(wait_queue_head_t *q)
2890{
2891 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
2892}
2893EXPORT_SYMBOL(sleep_on);
2894
2895long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
2896{
2897 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
2898}
2899EXPORT_SYMBOL(sleep_on_timeout);
2900
2901#ifdef CONFIG_RT_MUTEXES 2848#ifdef CONFIG_RT_MUTEXES
2902 2849
2903/* 2850/*
@@ -2908,7 +2855,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
2908 * This function changes the 'effective' priority of a task. It does 2855 * This function changes the 'effective' priority of a task. It does
2909 * not touch ->normal_prio like __setscheduler(). 2856 * not touch ->normal_prio like __setscheduler().
2910 * 2857 *
2911 * Used by the rt_mutex code to implement priority inheritance logic. 2858 * Used by the rt_mutex code to implement priority inheritance
2859 * logic. Call site only calls if the priority of the task changed.
2912 */ 2860 */
2913void rt_mutex_setprio(struct task_struct *p, int prio) 2861void rt_mutex_setprio(struct task_struct *p, int prio)
2914{ 2862{
@@ -2998,7 +2946,7 @@ void set_user_nice(struct task_struct *p, long nice)
2998 unsigned long flags; 2946 unsigned long flags;
2999 struct rq *rq; 2947 struct rq *rq;
3000 2948
3001 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 2949 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
3002 return; 2950 return;
3003 /* 2951 /*
3004 * We have to be careful, if called from sys_setpriority(), 2952 * We have to be careful, if called from sys_setpriority(),
@@ -3076,11 +3024,11 @@ SYSCALL_DEFINE1(nice, int, increment)
3076 if (increment > 40) 3024 if (increment > 40)
3077 increment = 40; 3025 increment = 40;
3078 3026
3079 nice = TASK_NICE(current) + increment; 3027 nice = task_nice(current) + increment;
3080 if (nice < -20) 3028 if (nice < MIN_NICE)
3081 nice = -20; 3029 nice = MIN_NICE;
3082 if (nice > 19) 3030 if (nice > MAX_NICE)
3083 nice = 19; 3031 nice = MAX_NICE;
3084 3032
3085 if (increment < 0 && !can_nice(current, nice)) 3033 if (increment < 0 && !can_nice(current, nice))
3086 return -EPERM; 3034 return -EPERM;
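A worked example of the clamping above, using the MIN_NICE/MAX_NICE limits the patch switches to (-20 and 19, the same bounds as the old literals): a task at nice 15 calling nice(50) ends up at 19.

	increment = 50;				/* first capped to 40 */
	nice = task_nice(current) + 40;		/* 15 + 40 = 55 */
	if (nice > MAX_NICE)			/* 55 > 19 */
		nice = MAX_NICE;		/* request becomes nice 19 */
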
@@ -3109,18 +3057,6 @@ int task_prio(const struct task_struct *p)
3109} 3057}
3110 3058
3111/** 3059/**
3112 * task_nice - return the nice value of a given task.
3113 * @p: the task in question.
3114 *
3115 * Return: The nice value [ -20 ... 0 ... 19 ].
3116 */
3117int task_nice(const struct task_struct *p)
3118{
3119 return TASK_NICE(p);
3120}
3121EXPORT_SYMBOL(task_nice);
3122
3123/**
3124 * idle_cpu - is a given cpu idle currently? 3060 * idle_cpu - is a given cpu idle currently?
3125 * @cpu: the processor in question. 3061 * @cpu: the processor in question.
3126 * 3062 *
@@ -3189,9 +3125,8 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3189 dl_se->dl_new = 1; 3125 dl_se->dl_new = 1;
3190} 3126}
3191 3127
3192/* Actually do priority change: must hold pi & rq lock. */ 3128static void __setscheduler_params(struct task_struct *p,
3193static void __setscheduler(struct rq *rq, struct task_struct *p, 3129 const struct sched_attr *attr)
3194 const struct sched_attr *attr)
3195{ 3130{
3196 int policy = attr->sched_policy; 3131 int policy = attr->sched_policy;
3197 3132
@@ -3211,9 +3146,21 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
3211 * getparam()/getattr() don't report silly values for !rt tasks. 3146 * getparam()/getattr() don't report silly values for !rt tasks.
3212 */ 3147 */
3213 p->rt_priority = attr->sched_priority; 3148 p->rt_priority = attr->sched_priority;
3214
3215 p->normal_prio = normal_prio(p); 3149 p->normal_prio = normal_prio(p);
3216 p->prio = rt_mutex_getprio(p); 3150 set_load_weight(p);
3151}
3152
3153/* Actually do priority change: must hold pi & rq lock. */
3154static void __setscheduler(struct rq *rq, struct task_struct *p,
3155 const struct sched_attr *attr)
3156{
3157 __setscheduler_params(p, attr);
3158
3159 /*
3160 * If we get here, there was no pi waiters boosting the
3161 * task. It is safe to use the normal prio.
3162 */
3163 p->prio = normal_prio(p);
3217 3164
3218 if (dl_prio(p->prio)) 3165 if (dl_prio(p->prio))
3219 p->sched_class = &dl_sched_class; 3166 p->sched_class = &dl_sched_class;
@@ -3221,8 +3168,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
3221 p->sched_class = &rt_sched_class; 3168 p->sched_class = &rt_sched_class;
3222 else 3169 else
3223 p->sched_class = &fair_sched_class; 3170 p->sched_class = &fair_sched_class;
3224
3225 set_load_weight(p);
3226} 3171}
3227 3172
3228static void 3173static void
@@ -3275,6 +3220,8 @@ static int __sched_setscheduler(struct task_struct *p,
3275 const struct sched_attr *attr, 3220 const struct sched_attr *attr,
3276 bool user) 3221 bool user)
3277{ 3222{
3223 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3224 MAX_RT_PRIO - 1 - attr->sched_priority;
3278 int retval, oldprio, oldpolicy = -1, on_rq, running; 3225 int retval, oldprio, oldpolicy = -1, on_rq, running;
3279 int policy = attr->sched_policy; 3226 int policy = attr->sched_policy;
3280 unsigned long flags; 3227 unsigned long flags;
@@ -3319,7 +3266,7 @@ recheck:
3319 */ 3266 */
3320 if (user && !capable(CAP_SYS_NICE)) { 3267 if (user && !capable(CAP_SYS_NICE)) {
3321 if (fair_policy(policy)) { 3268 if (fair_policy(policy)) {
3322 if (attr->sched_nice < TASK_NICE(p) && 3269 if (attr->sched_nice < task_nice(p) &&
3323 !can_nice(p, attr->sched_nice)) 3270 !can_nice(p, attr->sched_nice))
3324 return -EPERM; 3271 return -EPERM;
3325 } 3272 }
@@ -3352,7 +3299,7 @@ recheck:
3352 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3299 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3353 */ 3300 */
3354 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3301 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3355 if (!can_nice(p, TASK_NICE(p))) 3302 if (!can_nice(p, task_nice(p)))
3356 return -EPERM; 3303 return -EPERM;
3357 } 3304 }
3358 3305
@@ -3389,16 +3336,18 @@ recheck:
3389 } 3336 }
3390 3337
3391 /* 3338 /*
3392 * If not changing anything there's no need to proceed further: 3339 * If not changing anything there's no need to proceed further,
3340 * but store a possible modification of reset_on_fork.
3393 */ 3341 */
3394 if (unlikely(policy == p->policy)) { 3342 if (unlikely(policy == p->policy)) {
3395 if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) 3343 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
3396 goto change; 3344 goto change;
3397 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3345 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3398 goto change; 3346 goto change;
3399 if (dl_policy(policy)) 3347 if (dl_policy(policy))
3400 goto change; 3348 goto change;
3401 3349
3350 p->sched_reset_on_fork = reset_on_fork;
3402 task_rq_unlock(rq, p, &flags); 3351 task_rq_unlock(rq, p, &flags);
3403 return 0; 3352 return 0;
3404 } 3353 }
@@ -3452,6 +3401,24 @@ change:
3452 return -EBUSY; 3401 return -EBUSY;
3453 } 3402 }
3454 3403
3404 p->sched_reset_on_fork = reset_on_fork;
3405 oldprio = p->prio;
3406
3407 /*
3408 * Special case for priority boosted tasks.
3409 *
3410 * If the new priority is lower or equal (user space view)
3411 * than the current (boosted) priority, we just store the new
3412 * normal parameters and do not touch the scheduler class and
3413 * the runqueue. This will be done when the task deboost
3414 * itself.
3415 */
3416 if (rt_mutex_check_prio(p, newprio)) {
3417 __setscheduler_params(p, attr);
3418 task_rq_unlock(rq, p, &flags);
3419 return 0;
3420 }
3421
3455 on_rq = p->on_rq; 3422 on_rq = p->on_rq;
3456 running = task_current(rq, p); 3423 running = task_current(rq, p);
3457 if (on_rq) 3424 if (on_rq)
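For reference, the newprio computed at the top of __sched_setscheduler() uses the usual inverted kernel mapping (MAX_DL_PRIO is 0 and MAX_RT_PRIO is 100 in this tree; lower numbers are more important), so the boost check above compares like with like:

	/*
	 * newprio examples:
	 *   SCHED_DEADLINE           -> MAX_DL_PRIO - 1       = -1
	 *   SCHED_FIFO, priority 50  -> MAX_RT_PRIO - 1 - 50  = 49
	 *   SCHED_NORMAL             -> MAX_RT_PRIO - 1 - 0   = 99
	 *
	 * When rt_mutex_check_prio() reports that a PI boost already gives
	 * the task a better (lower) effective prio than newprio, only the
	 * normal parameters are stored; the class and runqueue update is
	 * left to the later deboost.
	 */
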
@@ -3459,16 +3426,18 @@ change:
3459 if (running) 3426 if (running)
3460 p->sched_class->put_prev_task(rq, p); 3427 p->sched_class->put_prev_task(rq, p);
3461 3428
3462 p->sched_reset_on_fork = reset_on_fork;
3463
3464 oldprio = p->prio;
3465 prev_class = p->sched_class; 3429 prev_class = p->sched_class;
3466 __setscheduler(rq, p, attr); 3430 __setscheduler(rq, p, attr);
3467 3431
3468 if (running) 3432 if (running)
3469 p->sched_class->set_curr_task(rq); 3433 p->sched_class->set_curr_task(rq);
3470 if (on_rq) 3434 if (on_rq) {
3471 enqueue_task(rq, p, 0); 3435 /*
3436 * We enqueue to tail when the priority of a task is
3437 * increased (user space view).
3438 */
3439 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
3440 }
3472 3441
3473 check_class_changed(rq, p, prev_class, oldprio); 3442 check_class_changed(rq, p, prev_class, oldprio);
3474 task_rq_unlock(rq, p, &flags); 3443 task_rq_unlock(rq, p, &flags);
@@ -3624,7 +3593,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
3624 * XXX: do we want to be lenient like existing syscalls; or do we want 3593 * XXX: do we want to be lenient like existing syscalls; or do we want
3625 * to be strict and return an error on out-of-bounds values? 3594 * to be strict and return an error on out-of-bounds values?
3626 */ 3595 */
3627 attr->sched_nice = clamp(attr->sched_nice, -20, 19); 3596 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
3628 3597
3629out: 3598out:
3630 return ret; 3599 return ret;
@@ -3845,7 +3814,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3845 else if (task_has_rt_policy(p)) 3814 else if (task_has_rt_policy(p))
3846 attr.sched_priority = p->rt_priority; 3815 attr.sched_priority = p->rt_priority;
3847 else 3816 else
3848 attr.sched_nice = TASK_NICE(p); 3817 attr.sched_nice = task_nice(p);
3849 3818
3850 rcu_read_unlock(); 3819 rcu_read_unlock();
3851 3820
@@ -4483,6 +4452,7 @@ void init_idle(struct task_struct *idle, int cpu)
4483 rcu_read_unlock(); 4452 rcu_read_unlock();
4484 4453
4485 rq->curr = rq->idle = idle; 4454 rq->curr = rq->idle = idle;
4455 idle->on_rq = 1;
4486#if defined(CONFIG_SMP) 4456#if defined(CONFIG_SMP)
4487 idle->on_cpu = 1; 4457 idle->on_cpu = 1;
4488#endif 4458#endif
@@ -4702,8 +4672,10 @@ void idle_task_exit(void)
4702 4672
4703 BUG_ON(cpu_online(smp_processor_id())); 4673 BUG_ON(cpu_online(smp_processor_id()));
4704 4674
4705 if (mm != &init_mm) 4675 if (mm != &init_mm) {
4706 switch_mm(mm, &init_mm, current); 4676 switch_mm(mm, &init_mm, current);
4677 finish_arch_post_lock_switch();
4678 }
4707 mmdrop(mm); 4679 mmdrop(mm);
4708} 4680}
4709 4681
@@ -4721,6 +4693,22 @@ static void calc_load_migrate(struct rq *rq)
4721 atomic_long_add(delta, &calc_load_tasks); 4693 atomic_long_add(delta, &calc_load_tasks);
4722} 4694}
4723 4695
4696static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
4697{
4698}
4699
4700static const struct sched_class fake_sched_class = {
4701 .put_prev_task = put_prev_task_fake,
4702};
4703
4704static struct task_struct fake_task = {
4705 /*
4706 * Avoid pull_{rt,dl}_task()
4707 */
4708 .prio = MAX_PRIO + 1,
4709 .sched_class = &fake_sched_class,
4710};
4711
4724/* 4712/*
4725 * Migrate all tasks from the rq, sleeping tasks will be migrated by 4713 * Migrate all tasks from the rq, sleeping tasks will be migrated by
4726 * try_to_wake_up()->select_task_rq(). 4714 * try_to_wake_up()->select_task_rq().
@@ -4761,7 +4749,7 @@ static void migrate_tasks(unsigned int dead_cpu)
4761 if (rq->nr_running == 1) 4749 if (rq->nr_running == 1)
4762 break; 4750 break;
4763 4751
4764 next = pick_next_task(rq); 4752 next = pick_next_task(rq, &fake_task);
4765 BUG_ON(!next); 4753 BUG_ON(!next);
4766 next->sched_class->put_prev_task(rq, next); 4754 next->sched_class->put_prev_task(rq, next);
4767 4755
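My reading of why the fake prev above is enough for the dead-CPU migration path:

	/*
	 * .prio = MAX_PRIO + 1 is less important than anything that can be
	 * queued, so neither the deadline nor the RT class sees a reason to
	 * pull work onto a CPU that is being drained (the hunk's own comment:
	 * avoid pull_{rt,dl}_task()), and .put_prev_task is a harmless no-op
	 * because the fake task was never enqueued anywhere.
	 */
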
@@ -4851,7 +4839,7 @@ set_table_entry(struct ctl_table *entry,
4851static struct ctl_table * 4839static struct ctl_table *
4852sd_alloc_ctl_domain_table(struct sched_domain *sd) 4840sd_alloc_ctl_domain_table(struct sched_domain *sd)
4853{ 4841{
4854 struct ctl_table *table = sd_alloc_ctl_entry(13); 4842 struct ctl_table *table = sd_alloc_ctl_entry(14);
4855 4843
4856 if (table == NULL) 4844 if (table == NULL)
4857 return NULL; 4845 return NULL;
@@ -4879,9 +4867,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
4879 sizeof(int), 0644, proc_dointvec_minmax, false); 4867 sizeof(int), 0644, proc_dointvec_minmax, false);
4880 set_table_entry(&table[10], "flags", &sd->flags, 4868 set_table_entry(&table[10], "flags", &sd->flags,
4881 sizeof(int), 0644, proc_dointvec_minmax, false); 4869 sizeof(int), 0644, proc_dointvec_minmax, false);
4882 set_table_entry(&table[11], "name", sd->name, 4870 set_table_entry(&table[11], "max_newidle_lb_cost",
4871 &sd->max_newidle_lb_cost,
4872 sizeof(long), 0644, proc_doulongvec_minmax, false);
4873 set_table_entry(&table[12], "name", sd->name,
4883 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 4874 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
4884 /* &table[12] is terminator */ 4875 /* &table[13] is terminator */
4885 4876
4886 return table; 4877 return table;
4887} 4878}
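Quick count for the resized sysctl table above; the terminator entry explains the 13 -> 14 bump:

	/*
	 * table[0..10]  pre-existing domain knobs
	 * table[11]     max_newidle_lb_cost (new)
	 * table[12]     name
	 * table[13]     all-zero terminator   => sd_alloc_ctl_entry(14)
	 */
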
@@ -6858,7 +6849,6 @@ void __init sched_init(void)
6858 6849
6859 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6850 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6860#ifdef CONFIG_RT_GROUP_SCHED 6851#ifdef CONFIG_RT_GROUP_SCHED
6861 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6862 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6852 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6863#endif 6853#endif
6864 6854
@@ -6947,7 +6937,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
6947 static unsigned long prev_jiffy; /* ratelimiting */ 6937 static unsigned long prev_jiffy; /* ratelimiting */
6948 6938
6949 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 6939 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
6950 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 6940 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6941 !is_idle_task(current)) ||
6951 system_state != SYSTEM_RUNNING || oops_in_progress) 6942 system_state != SYSTEM_RUNNING || oops_in_progress)
6952 return; 6943 return;
6953 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6944 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -6965,6 +6956,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)
6965 debug_show_held_locks(current); 6956 debug_show_held_locks(current);
6966 if (irqs_disabled()) 6957 if (irqs_disabled())
6967 print_irqtrace_events(current); 6958 print_irqtrace_events(current);
6959#ifdef CONFIG_DEBUG_PREEMPT
6960 if (!preempt_count_equals(preempt_offset)) {
6961 pr_err("Preemption disabled at:");
6962 print_ip_sym(current->preempt_disable_ip);
6963 pr_cont("\n");
6964 }
6965#endif
6968 dump_stack(); 6966 dump_stack();
6969} 6967}
6970EXPORT_SYMBOL(__might_sleep); 6968EXPORT_SYMBOL(__might_sleep);
@@ -7018,7 +7016,7 @@ void normalize_rt_tasks(void)
7018 * Renice negative nice level userspace 7016 * Renice negative nice level userspace
7019 * tasks back to 0: 7017 * tasks back to 0:
7020 */ 7018 */
7021 if (TASK_NICE(p) < 0 && p->mm) 7019 if (task_nice(p) < 0 && p->mm)
7022 set_user_nice(p, 0); 7020 set_user_nice(p, 0);
7023 continue; 7021 continue;
7024 } 7022 }
@@ -7186,7 +7184,7 @@ void sched_move_task(struct task_struct *tsk)
7186 if (unlikely(running)) 7184 if (unlikely(running))
7187 tsk->sched_class->put_prev_task(rq, tsk); 7185 tsk->sched_class->put_prev_task(rq, tsk);
7188 7186
7189 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, 7187 tg = container_of(task_css_check(tsk, cpu_cgrp_id,
7190 lockdep_is_held(&tsk->sighand->siglock)), 7188 lockdep_is_held(&tsk->sighand->siglock)),
7191 struct task_group, css); 7189 struct task_group, css);
7192 tg = autogroup_task_group(tsk, tg); 7190 tg = autogroup_task_group(tsk, tg);
@@ -7613,7 +7611,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7613{ 7611{
7614 struct task_struct *task; 7612 struct task_struct *task;
7615 7613
7616 cgroup_taskset_for_each(task, css, tset) { 7614 cgroup_taskset_for_each(task, tset) {
7617#ifdef CONFIG_RT_GROUP_SCHED 7615#ifdef CONFIG_RT_GROUP_SCHED
7618 if (!sched_rt_can_attach(css_tg(css), task)) 7616 if (!sched_rt_can_attach(css_tg(css), task))
7619 return -EINVAL; 7617 return -EINVAL;
@@ -7631,7 +7629,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7631{ 7629{
7632 struct task_struct *task; 7630 struct task_struct *task;
7633 7631
7634 cgroup_taskset_for_each(task, css, tset) 7632 cgroup_taskset_for_each(task, tset)
7635 sched_move_task(task); 7633 sched_move_task(task);
7636} 7634}
7637 7635
@@ -7970,8 +7968,7 @@ static struct cftype cpu_files[] = {
7970 { } /* terminate */ 7968 { } /* terminate */
7971}; 7969};
7972 7970
7973struct cgroup_subsys cpu_cgroup_subsys = { 7971struct cgroup_subsys cpu_cgrp_subsys = {
7974 .name = "cpu",
7975 .css_alloc = cpu_cgroup_css_alloc, 7972 .css_alloc = cpu_cgroup_css_alloc,
7976 .css_free = cpu_cgroup_css_free, 7973 .css_free = cpu_cgroup_css_free,
7977 .css_online = cpu_cgroup_css_online, 7974 .css_online = cpu_cgroup_css_online,
@@ -7979,7 +7976,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7979 .can_attach = cpu_cgroup_can_attach, 7976 .can_attach = cpu_cgroup_can_attach,
7980 .attach = cpu_cgroup_attach, 7977 .attach = cpu_cgroup_attach,
7981 .exit = cpu_cgroup_exit, 7978 .exit = cpu_cgroup_exit,
7982 .subsys_id = cpu_cgroup_subsys_id,
7983 .base_cftypes = cpu_files, 7979 .base_cftypes = cpu_files,
7984 .early_init = 1, 7980 .early_init = 1,
7985}; 7981};
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 622e0818f905..c143ee380e3a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -41,7 +41,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
41/* return cpu accounting group to which this task belongs */ 41/* return cpu accounting group to which this task belongs */
42static inline struct cpuacct *task_ca(struct task_struct *tsk) 42static inline struct cpuacct *task_ca(struct task_struct *tsk)
43{ 43{
44 return css_ca(task_css(tsk, cpuacct_subsys_id)); 44 return css_ca(task_css(tsk, cpuacct_cgrp_id));
45} 45}
46 46
47static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
@@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
275 rcu_read_unlock(); 275 rcu_read_unlock();
276} 276}
277 277
278struct cgroup_subsys cpuacct_subsys = { 278struct cgroup_subsys cpuacct_cgrp_subsys = {
279 .name = "cpuacct",
280 .css_alloc = cpuacct_css_alloc, 279 .css_alloc = cpuacct_css_alloc,
281 .css_free = cpuacct_css_free, 280 .css_free = cpuacct_css_free,
282 .subsys_id = cpuacct_subsys_id,
283 .base_cftypes = files, 281 .base_cftypes = files,
284 .early_init = 1, 282 .early_init = 1,
285}; 283};
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 99947919e30b..a95097cb4591 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
142 p->utimescaled += cputime_scaled; 142 p->utimescaled += cputime_scaled;
143 account_group_user_time(p, cputime); 143 account_group_user_time(p, cputime);
144 144
145 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; 145 index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
146 146
147 /* Add user time to cpustat. */ 147 /* Add user time to cpustat. */
148 task_group_account_field(p, index, (__force u64) cputime); 148 task_group_account_field(p, index, (__force u64) cputime);
@@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
169 p->gtime += cputime; 169 p->gtime += cputime;
170 170
171 /* Add guest time to cpustat. */ 171 /* Add guest time to cpustat. */
172 if (TASK_NICE(p) > 0) { 172 if (task_nice(p) > 0) {
173 cpustat[CPUTIME_NICE] += (__force u64) cputime; 173 cpustat[CPUTIME_NICE] += (__force u64) cputime;
174 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; 174 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
175 } else { 175 } else {
@@ -258,16 +258,22 @@ static __always_inline bool steal_account_process_tick(void)
258{ 258{
259#ifdef CONFIG_PARAVIRT 259#ifdef CONFIG_PARAVIRT
260 if (static_key_false(&paravirt_steal_enabled)) { 260 if (static_key_false(&paravirt_steal_enabled)) {
261 u64 steal, st = 0; 261 u64 steal;
262 cputime_t steal_ct;
262 263
263 steal = paravirt_steal_clock(smp_processor_id()); 264 steal = paravirt_steal_clock(smp_processor_id());
264 steal -= this_rq()->prev_steal_time; 265 steal -= this_rq()->prev_steal_time;
265 266
266 st = steal_ticks(steal); 267 /*
267 this_rq()->prev_steal_time += st * TICK_NSEC; 268 * cputime_t may be less precise than nsecs (eg: if it's
269 * based on jiffies). Lets cast the result to cputime
270 * granularity and account the rest on the next rounds.
271 */
272 steal_ct = nsecs_to_cputime(steal);
273 this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
268 274
269 account_steal_time(st); 275 account_steal_time(steal_ct);
270 return st; 276 return steal_ct;
271 } 277 }
272#endif 278#endif
273 return false; 279 return false;
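A worked example of the rounding the new comment describes, assuming a jiffies-based cputime_t with HZ=100 (10 ms per tick); the numbers are purely illustrative:

	/*
	 * steal since last check  = 25,000,000 ns
	 * steal_ct                = nsecs_to_cputime(25 ms) = 2 ticks
	 * prev_steal_time        += cputime_to_nsecs(2)     = 20,000,000 ns
	 *
	 * The 5 ms remainder stays in the pending delta and is accounted
	 * on a later round instead of being lost to truncation.
	 */
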
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6e79b3faa4cd..27ef40925525 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -210,6 +210,16 @@ static inline int has_pushable_dl_tasks(struct rq *rq)
210 210
211static int push_dl_task(struct rq *rq); 211static int push_dl_task(struct rq *rq);
212 212
213static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
214{
215 return dl_task(prev);
216}
217
218static inline void set_post_schedule(struct rq *rq)
219{
220 rq->post_schedule = has_pushable_dl_tasks(rq);
221}
222
213#else 223#else
214 224
215static inline 225static inline
@@ -232,6 +242,19 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
232{ 242{
233} 243}
234 244
245static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
246{
247 return false;
248}
249
250static inline int pull_dl_task(struct rq *rq)
251{
252 return 0;
253}
254
255static inline void set_post_schedule(struct rq *rq)
256{
257}
235#endif /* CONFIG_SMP */ 258#endif /* CONFIG_SMP */
236 259
237static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); 260static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -586,8 +609,8 @@ static void update_curr_dl(struct rq *rq)
586 * approach need further study. 609 * approach need further study.
587 */ 610 */
588 delta_exec = rq_clock_task(rq) - curr->se.exec_start; 611 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
589 if (unlikely((s64)delta_exec < 0)) 612 if (unlikely((s64)delta_exec <= 0))
590 delta_exec = 0; 613 return;
591 614
592 schedstat_set(curr->se.statistics.exec_max, 615 schedstat_set(curr->se.statistics.exec_max,
593 max(curr->se.statistics.exec_max, delta_exec)); 616 max(curr->se.statistics.exec_max, delta_exec));
@@ -942,6 +965,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
942 resched_task(rq->curr); 965 resched_task(rq->curr);
943} 966}
944 967
968static int pull_dl_task(struct rq *this_rq);
969
945#endif /* CONFIG_SMP */ 970#endif /* CONFIG_SMP */
946 971
947/* 972/*
@@ -988,7 +1013,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
988 return rb_entry(left, struct sched_dl_entity, rb_node); 1013 return rb_entry(left, struct sched_dl_entity, rb_node);
989} 1014}
990 1015
991struct task_struct *pick_next_task_dl(struct rq *rq) 1016struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
992{ 1017{
993 struct sched_dl_entity *dl_se; 1018 struct sched_dl_entity *dl_se;
994 struct task_struct *p; 1019 struct task_struct *p;
@@ -996,9 +1021,20 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
996 1021
997 dl_rq = &rq->dl; 1022 dl_rq = &rq->dl;
998 1023
1024 if (need_pull_dl_task(rq, prev))
1025 pull_dl_task(rq);
1026 /*
1027 * When prev is DL, we may throttle it in put_prev_task().
1028 * So, we update time before we check for dl_nr_running.
1029 */
1030 if (prev->sched_class == &dl_sched_class)
1031 update_curr_dl(rq);
1032
999 if (unlikely(!dl_rq->dl_nr_running)) 1033 if (unlikely(!dl_rq->dl_nr_running))
1000 return NULL; 1034 return NULL;
1001 1035
1036 put_prev_task(rq, prev);
1037
1002 dl_se = pick_next_dl_entity(rq, dl_rq); 1038 dl_se = pick_next_dl_entity(rq, dl_rq);
1003 BUG_ON(!dl_se); 1039 BUG_ON(!dl_se);
1004 1040
@@ -1013,9 +1049,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
1013 start_hrtick_dl(rq, p); 1049 start_hrtick_dl(rq, p);
1014#endif 1050#endif
1015 1051
1016#ifdef CONFIG_SMP 1052 set_post_schedule(rq);
1017 rq->post_schedule = has_pushable_dl_tasks(rq);
1018#endif /* CONFIG_SMP */
1019 1053
1020 return p; 1054 return p;
1021} 1055}
@@ -1424,13 +1458,6 @@ skip:
1424 return ret; 1458 return ret;
1425} 1459}
1426 1460
1427static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
1428{
1429 /* Try to pull other tasks here */
1430 if (dl_task(prev))
1431 pull_dl_task(rq);
1432}
1433
1434static void post_schedule_dl(struct rq *rq) 1461static void post_schedule_dl(struct rq *rq)
1435{ 1462{
1436 push_dl_tasks(rq); 1463 push_dl_tasks(rq);
@@ -1558,7 +1585,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1558 if (unlikely(p->dl.dl_throttled)) 1585 if (unlikely(p->dl.dl_throttled))
1559 return; 1586 return;
1560 1587
1561 if (p->on_rq || rq->curr != p) { 1588 if (p->on_rq && rq->curr != p) {
1562#ifdef CONFIG_SMP 1589#ifdef CONFIG_SMP
1563 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1590 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1564 /* Only reschedule if pushing failed */ 1591 /* Only reschedule if pushing failed */
@@ -1623,7 +1650,6 @@ const struct sched_class dl_sched_class = {
1623 .set_cpus_allowed = set_cpus_allowed_dl, 1650 .set_cpus_allowed = set_cpus_allowed_dl,
1624 .rq_online = rq_online_dl, 1651 .rq_online = rq_online_dl,
1625 .rq_offline = rq_offline_dl, 1652 .rq_offline = rq_offline_dl,
1626 .pre_schedule = pre_schedule_dl,
1627 .post_schedule = post_schedule_dl, 1653 .post_schedule = post_schedule_dl,
1628 .task_woken = task_woken_dl, 1654 .task_woken = task_woken_dl,
1629#endif 1655#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index dd52e7ffb10e..695f9773bb60 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg)
111 if (autogroup_path(tg, group_path, PATH_MAX)) 111 if (autogroup_path(tg, group_path, PATH_MAX))
112 return group_path; 112 return group_path;
113 113
114 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 114 return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
115 return group_path;
116} 115}
117#endif 116#endif
118 117
@@ -321,6 +320,7 @@ do { \
321 P(sched_goidle); 320 P(sched_goidle);
322#ifdef CONFIG_SMP 321#ifdef CONFIG_SMP
323 P64(avg_idle); 322 P64(avg_idle);
323 P64(max_idle_balance_cost);
324#endif 324#endif
325 325
326 P(ttwu_count); 326 P(ttwu_count);
@@ -533,15 +533,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
533 unsigned long nr_faults = -1; 533 unsigned long nr_faults = -1;
534 int cpu_current, home_node; 534 int cpu_current, home_node;
535 535
536 if (p->numa_faults) 536 if (p->numa_faults_memory)
537 nr_faults = p->numa_faults[2*node + i]; 537 nr_faults = p->numa_faults_memory[2*node + i];
538 538
539 cpu_current = !i ? (task_node(p) == node) : 539 cpu_current = !i ? (task_node(p) == node) :
540 (pol && node_isset(node, pol->v.nodes)); 540 (pol && node_isset(node, pol->v.nodes));
541 541
542 home_node = (p->numa_preferred_nid == node); 542 home_node = (p->numa_preferred_nid == node);
543 543
544 SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", 544 SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
545 i, node, cpu_current, home_node, nr_faults); 545 i, node, cpu_current, home_node, nr_faults);
546 } 546 }
547 } 547 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9b4c4f320130..7e9bd0b1fa9e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
323 323
324/* Do the two (enqueued) entities belong to the same group ? */ 324/* Do the two (enqueued) entities belong to the same group ? */
325static inline int 325static inline struct cfs_rq *
326is_same_group(struct sched_entity *se, struct sched_entity *pse) 326is_same_group(struct sched_entity *se, struct sched_entity *pse)
327{ 327{
328 if (se->cfs_rq == pse->cfs_rq) 328 if (se->cfs_rq == pse->cfs_rq)
329 return 1; 329 return se->cfs_rq;
330 330
331 return 0; 331 return NULL;
332} 332}
333 333
334static inline struct sched_entity *parent_entity(struct sched_entity *se) 334static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
336 return se->parent; 336 return se->parent;
337} 337}
338 338
339/* return depth at which a sched entity is present in the hierarchy */
340static inline int depth_se(struct sched_entity *se)
341{
342 int depth = 0;
343
344 for_each_sched_entity(se)
345 depth++;
346
347 return depth;
348}
349
350static void 339static void
351find_matching_se(struct sched_entity **se, struct sched_entity **pse) 340find_matching_se(struct sched_entity **se, struct sched_entity **pse)
352{ 341{
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
360 */ 349 */
361 350
362 /* First walk up until both entities are at same depth */ 351 /* First walk up until both entities are at same depth */
363 se_depth = depth_se(*se); 352 se_depth = (*se)->depth;
364 pse_depth = depth_se(*pse); 353 pse_depth = (*pse)->depth;
365 354
366 while (se_depth > pse_depth) { 355 while (se_depth > pse_depth) {
367 se_depth--; 356 se_depth--;
@@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
426#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 415#define for_each_leaf_cfs_rq(rq, cfs_rq) \
427 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 416 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
428 417
429static inline int
430is_same_group(struct sched_entity *se, struct sched_entity *pse)
431{
432 return 1;
433}
434
435static inline struct sched_entity *parent_entity(struct sched_entity *se) 418static inline struct sched_entity *parent_entity(struct sched_entity *se)
436{ 419{
437 return NULL; 420 return NULL;
@@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
819/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 802/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
820unsigned int sysctl_numa_balancing_scan_delay = 1000; 803unsigned int sysctl_numa_balancing_scan_delay = 1000;
821 804
822/*
823 * After skipping a page migration on a shared page, skip N more numa page
824 * migrations unconditionally. This reduces the number of NUMA migrations
825 * in shared memory workloads, and has the effect of pulling tasks towards
826 * where their memory lives, over pulling the memory towards the task.
827 */
828unsigned int sysctl_numa_balancing_migrate_deferred = 16;
829
830static unsigned int task_nr_scan_windows(struct task_struct *p) 805static unsigned int task_nr_scan_windows(struct task_struct *p)
831{ 806{
832 unsigned long rss = 0; 807 unsigned long rss = 0;
@@ -893,10 +868,26 @@ struct numa_group {
893 struct list_head task_list; 868 struct list_head task_list;
894 869
895 struct rcu_head rcu; 870 struct rcu_head rcu;
871 nodemask_t active_nodes;
896 unsigned long total_faults; 872 unsigned long total_faults;
873 /*
874 * Faults_cpu is used to decide whether memory should move
875 * towards the CPU. As a consequence, these stats are weighted
876 * more by CPU use than by memory faults.
877 */
878 unsigned long *faults_cpu;
897 unsigned long faults[0]; 879 unsigned long faults[0];
898}; 880};
899 881
882/* Shared or private faults. */
883#define NR_NUMA_HINT_FAULT_TYPES 2
884
885/* Memory and CPU locality */
886#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
887
888/* Averaged statistics, and temporary buffers. */
889#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
890
900pid_t task_numa_group_id(struct task_struct *p) 891pid_t task_numa_group_id(struct task_struct *p)
901{ 892{
902 return p->numa_group ? p->numa_group->gid : 0; 893 return p->numa_group ? p->numa_group->gid : 0;
@@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p)
904 895
905static inline int task_faults_idx(int nid, int priv) 896static inline int task_faults_idx(int nid, int priv)
906{ 897{
907 return 2 * nid + priv; 898 return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
908} 899}
909 900
910static inline unsigned long task_faults(struct task_struct *p, int nid) 901static inline unsigned long task_faults(struct task_struct *p, int nid)
911{ 902{
912 if (!p->numa_faults) 903 if (!p->numa_faults_memory)
913 return 0; 904 return 0;
914 905
915 return p->numa_faults[task_faults_idx(nid, 0)] + 906 return p->numa_faults_memory[task_faults_idx(nid, 0)] +
916 p->numa_faults[task_faults_idx(nid, 1)]; 907 p->numa_faults_memory[task_faults_idx(nid, 1)];
917} 908}
918 909
919static inline unsigned long group_faults(struct task_struct *p, int nid) 910static inline unsigned long group_faults(struct task_struct *p, int nid)
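Index layout implied by the helpers above, with NR_NUMA_HINT_FAULT_TYPES == 2 (one private and one shared slot per node; which index holds which type does not matter for the sums taken here):

	/*
	 * task_faults_idx(nid, priv) = 2 * nid + priv
	 *
	 *   node 0 -> slots 0, 1
	 *   node 1 -> slots 2, 3
	 *   node n -> slots 2n, 2n + 1
	 */
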
@@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
925 p->numa_group->faults[task_faults_idx(nid, 1)]; 916 p->numa_group->faults[task_faults_idx(nid, 1)];
926} 917}
927 918
919static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
920{
921 return group->faults_cpu[task_faults_idx(nid, 0)] +
922 group->faults_cpu[task_faults_idx(nid, 1)];
923}
924
928/* 925/*
929 * These return the fraction of accesses done by a particular task, or 926 * These return the fraction of accesses done by a particular task, or
930 * task group, on a particular numa node. The group weight is given a 927 * task group, on a particular numa node. The group weight is given a
@@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
935{ 932{
936 unsigned long total_faults; 933 unsigned long total_faults;
937 934
938 if (!p->numa_faults) 935 if (!p->numa_faults_memory)
939 return 0; 936 return 0;
940 937
941 total_faults = p->total_numa_faults; 938 total_faults = p->total_numa_faults;
@@ -954,6 +951,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
954 return 1000 * group_faults(p, nid) / p->numa_group->total_faults; 951 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
955} 952}
956 953
954bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
955 int src_nid, int dst_cpu)
956{
957 struct numa_group *ng = p->numa_group;
958 int dst_nid = cpu_to_node(dst_cpu);
959 int last_cpupid, this_cpupid;
960
961 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
962
963 /*
964 * Multi-stage node selection is used in conjunction with a periodic
965 * migration fault to build a temporal task<->page relation. By using
966 * a two-stage filter we remove short/unlikely relations.
967 *
968 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
969 * a task's usage of a particular page (n_p) per total usage of this
970 * page (n_t) (in a given time-span) to a probability.
971 *
972 * Our periodic faults will sample this probability and getting the
973 * same result twice in a row, given these samples are fully
974 * independent, is then given by P(p)^2, provided our sample period
975 * is sufficiently short compared to the usage pattern.
976 *
977 * This quadratic squishes small probabilities, making it less likely we
978 * act on an unlikely task<->page relation.
979 */
980 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
981 if (!cpupid_pid_unset(last_cpupid) &&
982 cpupid_to_nid(last_cpupid) != dst_nid)
983 return false;
984
985 /* Always allow migrate on private faults */
986 if (cpupid_match_pid(p, last_cpupid))
987 return true;
988
989 /* A shared fault, but p->numa_group has not been set up yet. */
990 if (!ng)
991 return true;
992
993 /*
994 * Do not migrate if the destination is not a node that
995 * is actively used by this numa group.
996 */
997 if (!node_isset(dst_nid, ng->active_nodes))
998 return false;
999
1000 /*
1001 * Source is a node that is not actively used by this
1002 * numa group, while the destination is. Migrate.
1003 */
1004 if (!node_isset(src_nid, ng->active_nodes))
1005 return true;
1006
1007 /*
1008 * Both source and destination are nodes in active
1009 * use by this numa group. Maximize memory bandwidth
1010 * by migrating from more heavily used groups, to less
1011 * heavily used ones, spreading the load around.
1012 * Use a 1/4 hysteresis to avoid spurious page movement.
1013 */
1014 return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1015}
1016
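The final check above spreads memory bandwidth between two nodes that are both in active use, moving a page only when the destination sees clearly fewer group faults than the source. A minimal user-space sketch of that 3/4 comparison, using hypothetical fault counts (not kernel code), shows how the hysteresis behaves around the threshold:

#include <stdbool.h>
#include <stdio.h>

/* Same comparison as the tail of should_numa_migrate_memory() above. */
static bool migrate_between_active_nodes(unsigned long dst_faults,
					 unsigned long src_faults)
{
	return dst_faults < src_faults * 3 / 4;
}

int main(void)
{
	/* 800 vs 1000 group faults: 800 >= 750, the page stays put. */
	printf("%d\n", migrate_between_active_nodes(800, 1000));
	/* 700 vs 1000 group faults: 700 < 750, the page migrates. */
	printf("%d\n", migrate_between_active_nodes(700, 1000));
	return 0;
}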
957static unsigned long weighted_cpuload(const int cpu); 1017static unsigned long weighted_cpuload(const int cpu);
958static unsigned long source_load(int cpu, int type); 1018static unsigned long source_load(int cpu, int type);
959static unsigned long target_load(int cpu, int type); 1019static unsigned long target_load(int cpu, int type);
@@ -1267,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p)
1267static void numa_migrate_preferred(struct task_struct *p) 1327static void numa_migrate_preferred(struct task_struct *p)
1268{ 1328{
1269 /* This task has no NUMA fault statistics yet */ 1329 /* This task has no NUMA fault statistics yet */
1270 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) 1330 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1271 return; 1331 return;
1272 1332
1273 /* Periodically retry migrating the task to the preferred node */ 1333 /* Periodically retry migrating the task to the preferred node */
@@ -1282,6 +1342,38 @@ static void numa_migrate_preferred(struct task_struct *p)
1282} 1342}
1283 1343
1284/* 1344/*
1345 * Find the nodes on which the workload is actively running. We do this by
1346 * tracking the nodes from which NUMA hinting faults are triggered. This can
1347 * be different from the set of nodes where the workload's memory is currently
1348 * located.
1349 *
1350 * The bitmask is used to make smarter decisions on when to do NUMA page
1351 * migrations. To prevent flip-flopping and excessive page migrations, nodes
1352 * are added when they cause over 6/16 of the maximum number of faults, but
1353 * only removed when they drop below 3/16.
1354 */
1355static void update_numa_active_node_mask(struct numa_group *numa_group)
1356{
1357 unsigned long faults, max_faults = 0;
1358 int nid;
1359
1360 for_each_online_node(nid) {
1361 faults = group_faults_cpu(numa_group, nid);
1362 if (faults > max_faults)
1363 max_faults = faults;
1364 }
1365
1366 for_each_online_node(nid) {
1367 faults = group_faults_cpu(numa_group, nid);
1368 if (!node_isset(nid, numa_group->active_nodes)) {
1369 if (faults > max_faults * 6 / 16)
1370 node_set(nid, numa_group->active_nodes);
1371 } else if (faults < max_faults * 3 / 16)
1372 node_clear(nid, numa_group->active_nodes);
1373 }
1374}
1375
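A user-space sketch of the add/remove hysteresis in update_numa_active_node_mask(), assuming a hypothetical four-node machine with made-up per-node CPU-fault counts; the 6/16 and 3/16 fractions come straight from the hunk above, everything else is invented for illustration:

#include <stdio.h>

#define NR_NODES 4

int main(void)
{
	/* Hypothetical CPU-fault counts per node for one numa_group. */
	unsigned long faults[NR_NODES] = { 1600, 700, 500, 250 };
	int active[NR_NODES] = { 1, 0, 1, 0 };	/* current active_nodes */
	unsigned long max_faults = 0;
	int nid;

	for (nid = 0; nid < NR_NODES; nid++)
		if (faults[nid] > max_faults)
			max_faults = faults[nid];	/* 1600 */

	for (nid = 0; nid < NR_NODES; nid++) {
		if (!active[nid]) {
			/* join above 6/16 of max (600): node 1 joins, node 3 stays out */
			if (faults[nid] > max_faults * 6 / 16)
				active[nid] = 1;
		} else if (faults[nid] < max_faults * 3 / 16) {
			/* drop below 3/16 of max (300): nodes 0 and 2 stay in */
			active[nid] = 0;
		}
	}

	for (nid = 0; nid < NR_NODES; nid++)
		printf("node %d: %sactive\n", nid, active[nid] ? "" : "in");
	return 0;
}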
1376/*
1285 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 1377 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1286 * increments. The more local the fault statistics are, the higher the scan 1378 * increments. The more local the fault statistics are, the higher the scan
1287 * period will be for the next scan window. If local/remote ratio is below 1379 * period will be for the next scan window. If local/remote ratio is below
@@ -1355,11 +1447,41 @@ static void update_task_scan_period(struct task_struct *p,
1355 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1447 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1356} 1448}
1357 1449
1450/*
1451 * Get the fraction of time the task has been running since the last
1452 * NUMA placement cycle. The scheduler keeps similar statistics, but
1453 * decays those on a 32ms period, which is orders of magnitude off
1454 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1455 * stats only if the task is so new there are no NUMA statistics yet.
1456 */
1457static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1458{
1459 u64 runtime, delta, now;
1460 /* Use the start of this time slice to avoid calculations. */
1461 now = p->se.exec_start;
1462 runtime = p->se.sum_exec_runtime;
1463
1464 if (p->last_task_numa_placement) {
1465 delta = runtime - p->last_sum_exec_runtime;
1466 *period = now - p->last_task_numa_placement;
1467 } else {
1468 delta = p->se.avg.runnable_avg_sum;
1469 *period = p->se.avg.runnable_avg_period;
1470 }
1471
1472 p->last_sum_exec_runtime = runtime;
1473 p->last_task_numa_placement = now;
1474
1475 return delta;
1476}
1477
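A small user-space model of numa_get_avg_runtime(): once a first placement pass has recorded last_sum_exec_runtime and last_task_numa_placement, the delta/period pair is simply CPU time consumed over wall-clock time elapsed. The struct fields and numbers below are hypothetical stand-ins for the task_struct members used above, and the brand-new-task branch is omitted:

#include <stdint.h>
#include <stdio.h>

struct fake_task {
	uint64_t sum_exec_runtime;		/* total CPU time consumed */
	uint64_t exec_start;			/* start of current slice */
	uint64_t last_sum_exec_runtime;
	uint64_t last_task_numa_placement;
};

/* Same shape as numa_get_avg_runtime() above, minus the new-task case. */
static uint64_t avg_runtime(struct fake_task *p, uint64_t *period)
{
	uint64_t delta = p->sum_exec_runtime - p->last_sum_exec_runtime;

	*period = p->exec_start - p->last_task_numa_placement;
	p->last_sum_exec_runtime = p->sum_exec_runtime;
	p->last_task_numa_placement = p->exec_start;
	return delta;
}

int main(void)
{
	/* The task ran 3s of CPU time over a 10s wall-clock window. */
	struct fake_task t = {
		.sum_exec_runtime	  = 13000000000ull,
		.exec_start		  = 60000000000ull,
		.last_sum_exec_runtime	  = 10000000000ull,
		.last_task_numa_placement = 50000000000ull,
	};
	uint64_t period;
	uint64_t runtime = avg_runtime(&t, &period);

	printf("ran %llu ns out of %llu ns (~%llu%%)\n",
	       (unsigned long long)runtime, (unsigned long long)period,
	       (unsigned long long)(runtime * 100 / period));
	return 0;
}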
1358static void task_numa_placement(struct task_struct *p) 1478static void task_numa_placement(struct task_struct *p)
1359{ 1479{
1360 int seq, nid, max_nid = -1, max_group_nid = -1; 1480 int seq, nid, max_nid = -1, max_group_nid = -1;
1361 unsigned long max_faults = 0, max_group_faults = 0; 1481 unsigned long max_faults = 0, max_group_faults = 0;
1362 unsigned long fault_types[2] = { 0, 0 }; 1482 unsigned long fault_types[2] = { 0, 0 };
1483 unsigned long total_faults;
1484 u64 runtime, period;
1363 spinlock_t *group_lock = NULL; 1485 spinlock_t *group_lock = NULL;
1364 1486
1365 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1487 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1368,6 +1490,10 @@ static void task_numa_placement(struct task_struct *p)
1368 p->numa_scan_seq = seq; 1490 p->numa_scan_seq = seq;
1369 p->numa_scan_period_max = task_scan_max(p); 1491 p->numa_scan_period_max = task_scan_max(p);
1370 1492
1493 total_faults = p->numa_faults_locality[0] +
1494 p->numa_faults_locality[1];
1495 runtime = numa_get_avg_runtime(p, &period);
1496
1371 /* If the task is part of a group prevent parallel updates to group stats */ 1497 /* If the task is part of a group prevent parallel updates to group stats */
1372 if (p->numa_group) { 1498 if (p->numa_group) {
1373 group_lock = &p->numa_group->lock; 1499 group_lock = &p->numa_group->lock;
@@ -1379,24 +1505,37 @@ static void task_numa_placement(struct task_struct *p)
1379 unsigned long faults = 0, group_faults = 0; 1505 unsigned long faults = 0, group_faults = 0;
1380 int priv, i; 1506 int priv, i;
1381 1507
1382 for (priv = 0; priv < 2; priv++) { 1508 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1383 long diff; 1509 long diff, f_diff, f_weight;
1384 1510
1385 i = task_faults_idx(nid, priv); 1511 i = task_faults_idx(nid, priv);
1386 diff = -p->numa_faults[i];
1387 1512
1388 /* Decay existing window, copy faults since last scan */ 1513 /* Decay existing window, copy faults since last scan */
1389 p->numa_faults[i] >>= 1; 1514 diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
1390 p->numa_faults[i] += p->numa_faults_buffer[i]; 1515 fault_types[priv] += p->numa_faults_buffer_memory[i];
1391 fault_types[priv] += p->numa_faults_buffer[i]; 1516 p->numa_faults_buffer_memory[i] = 0;
1392 p->numa_faults_buffer[i] = 0;
1393 1517
1394 faults += p->numa_faults[i]; 1518 /*
1395 diff += p->numa_faults[i]; 1519 * Normalize the faults_from, so all tasks in a group
1520 * count according to CPU use, instead of by the raw
1521 * number of faults. Tasks with little runtime have
1522 * little over-all impact on throughput, and thus their
1523 * faults are less important.
1524 */
1525 f_weight = div64_u64(runtime << 16, period + 1);
1526 f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
1527 (total_faults + 1);
1528 f_diff = f_weight - p->numa_faults_cpu[i] / 2;
1529 p->numa_faults_buffer_cpu[i] = 0;
1530
1531 p->numa_faults_memory[i] += diff;
1532 p->numa_faults_cpu[i] += f_diff;
1533 faults += p->numa_faults_memory[i];
1396 p->total_numa_faults += diff; 1534 p->total_numa_faults += diff;
1397 if (p->numa_group) { 1535 if (p->numa_group) {
1398 /* safe because we can only change our own group */ 1536 /* safe because we can only change our own group */
1399 p->numa_group->faults[i] += diff; 1537 p->numa_group->faults[i] += diff;
1538 p->numa_group->faults_cpu[i] += f_diff;
1400 p->numa_group->total_faults += diff; 1539 p->numa_group->total_faults += diff;
1401 group_faults += p->numa_group->faults[i]; 1540 group_faults += p->numa_group->faults[i];
1402 } 1541 }
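The f_weight arithmetic above is 16.16 fixed point: runtime << 16 divided by the period gives the fraction of the window the task was on a CPU, which then scales that task's share of the scan's CPU faults before folding into the decayed running average. A rough user-space recalculation with hypothetical numbers (plain 64-bit division instead of div64_u64):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical numbers for one task and one (node, priv) slot. */
	uint64_t runtime = 3000000000ull;	/* 3s of CPU time ...      */
	uint64_t period  = 10000000000ull;	/* ... in a 10s window     */
	uint64_t buffer_cpu_faults = 200;	/* raw CPU faults this scan */
	uint64_t total_faults = 1000;		/* all faults this scan     */
	uint64_t old_cpu_stat = 500;		/* decayed running average  */

	/* 16.16 fixed-point fraction of CPU time used: ~0.3 * 65536 ~ 19660 */
	uint64_t f_weight = (runtime << 16) / (period + 1);
	/* Scale the raw fault count by CPU use and by share of all faults. */
	f_weight = f_weight * buffer_cpu_faults / (total_faults + 1);
	/* Fold into the running average exactly like the hunk above. */
	int64_t f_diff = (int64_t)f_weight - (int64_t)(old_cpu_stat / 2);

	printf("f_weight=%llu f_diff=%lld\n",
	       (unsigned long long)f_weight, (long long)f_diff);
	return 0;
}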
@@ -1416,6 +1555,7 @@ static void task_numa_placement(struct task_struct *p)
1416 update_task_scan_period(p, fault_types[0], fault_types[1]); 1555 update_task_scan_period(p, fault_types[0], fault_types[1]);
1417 1556
1418 if (p->numa_group) { 1557 if (p->numa_group) {
1558 update_numa_active_node_mask(p->numa_group);
1419 /* 1559 /*
1420 * If the preferred task and group nids are different, 1560 * If the preferred task and group nids are different,
1421 * iterate over the nodes again to find the best place. 1561 * iterate over the nodes again to find the best place.
@@ -1465,7 +1605,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1465 1605
1466 if (unlikely(!p->numa_group)) { 1606 if (unlikely(!p->numa_group)) {
1467 unsigned int size = sizeof(struct numa_group) + 1607 unsigned int size = sizeof(struct numa_group) +
1468 2*nr_node_ids*sizeof(unsigned long); 1608 4*nr_node_ids*sizeof(unsigned long);
1469 1609
1470 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); 1610 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1471 if (!grp) 1611 if (!grp)
@@ -1475,9 +1615,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1475 spin_lock_init(&grp->lock); 1615 spin_lock_init(&grp->lock);
1476 INIT_LIST_HEAD(&grp->task_list); 1616 INIT_LIST_HEAD(&grp->task_list);
1477 grp->gid = p->pid; 1617 grp->gid = p->pid;
1618 /* Second half of the array tracks nids where faults happen */
1619 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1620 nr_node_ids;
1621
1622 node_set(task_node(current), grp->active_nodes);
1478 1623
1479 for (i = 0; i < 2*nr_node_ids; i++) 1624 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1480 grp->faults[i] = p->numa_faults[i]; 1625 grp->faults[i] = p->numa_faults_memory[i];
1481 1626
1482 grp->total_faults = p->total_numa_faults; 1627 grp->total_faults = p->total_numa_faults;
1483 1628
@@ -1534,9 +1679,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1534 1679
1535 double_lock(&my_grp->lock, &grp->lock); 1680 double_lock(&my_grp->lock, &grp->lock);
1536 1681
1537 for (i = 0; i < 2*nr_node_ids; i++) { 1682 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1538 my_grp->faults[i] -= p->numa_faults[i]; 1683 my_grp->faults[i] -= p->numa_faults_memory[i];
1539 grp->faults[i] += p->numa_faults[i]; 1684 grp->faults[i] += p->numa_faults_memory[i];
1540 } 1685 }
1541 my_grp->total_faults -= p->total_numa_faults; 1686 my_grp->total_faults -= p->total_numa_faults;
1542 grp->total_faults += p->total_numa_faults; 1687 grp->total_faults += p->total_numa_faults;
@@ -1562,12 +1707,12 @@ void task_numa_free(struct task_struct *p)
1562{ 1707{
1563 struct numa_group *grp = p->numa_group; 1708 struct numa_group *grp = p->numa_group;
1564 int i; 1709 int i;
1565 void *numa_faults = p->numa_faults; 1710 void *numa_faults = p->numa_faults_memory;
1566 1711
1567 if (grp) { 1712 if (grp) {
1568 spin_lock(&grp->lock); 1713 spin_lock(&grp->lock);
1569 for (i = 0; i < 2*nr_node_ids; i++) 1714 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1570 grp->faults[i] -= p->numa_faults[i]; 1715 grp->faults[i] -= p->numa_faults_memory[i];
1571 grp->total_faults -= p->total_numa_faults; 1716 grp->total_faults -= p->total_numa_faults;
1572 1717
1573 list_del(&p->numa_entry); 1718 list_del(&p->numa_entry);
@@ -1577,18 +1722,21 @@ void task_numa_free(struct task_struct *p)
1577 put_numa_group(grp); 1722 put_numa_group(grp);
1578 } 1723 }
1579 1724
1580 p->numa_faults = NULL; 1725 p->numa_faults_memory = NULL;
1581 p->numa_faults_buffer = NULL; 1726 p->numa_faults_buffer_memory = NULL;
1727 p->numa_faults_cpu = NULL;
1728 p->numa_faults_buffer_cpu = NULL;
1582 kfree(numa_faults); 1729 kfree(numa_faults);
1583} 1730}
1584 1731
1585/* 1732/*
1586 * Got a PROT_NONE fault for a page on @node. 1733 * Got a PROT_NONE fault for a page on @node.
1587 */ 1734 */
1588void task_numa_fault(int last_cpupid, int node, int pages, int flags) 1735void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1589{ 1736{
1590 struct task_struct *p = current; 1737 struct task_struct *p = current;
1591 bool migrated = flags & TNF_MIGRATED; 1738 bool migrated = flags & TNF_MIGRATED;
1739 int cpu_node = task_node(current);
1592 int priv; 1740 int priv;
1593 1741
1594 if (!numabalancing_enabled) 1742 if (!numabalancing_enabled)
@@ -1603,16 +1751,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1603 return; 1751 return;
1604 1752
1605 /* Allocate buffer to track faults on a per-node basis */ 1753 /* Allocate buffer to track faults on a per-node basis */
1606 if (unlikely(!p->numa_faults)) { 1754 if (unlikely(!p->numa_faults_memory)) {
1607 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; 1755 int size = sizeof(*p->numa_faults_memory) *
1756 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1608 1757
1609 /* numa_faults and numa_faults_buffer share the allocation */ 1758 p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1610 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); 1759 if (!p->numa_faults_memory)
1611 if (!p->numa_faults)
1612 return; 1760 return;
1613 1761
1614 BUG_ON(p->numa_faults_buffer); 1762 BUG_ON(p->numa_faults_buffer_memory);
1615 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); 1763 /*
1764 * The averaged statistics, shared & private, memory & cpu,
1765 * occupy the first half of the array. The second half of the
1766 * array is for current counters, which are averaged into the
1767 * first set by task_numa_placement.
1768 */
1769 p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1770 p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1771 p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1616 p->total_numa_faults = 0; 1772 p->total_numa_faults = 0;
1617 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1773 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1618 } 1774 }
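The allocation above packs four equally sized views into one array of NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids counters. A user-space sketch of the same layout, assuming a hypothetical four-node machine, shows where each view starts and how task_faults_idx() addresses a (node, private/shared) slot:

#include <stdio.h>
#include <stdlib.h>

/* Constants mirrored from the hunk above. */
#define NR_NUMA_HINT_FAULT_TYPES	2
#define NR_NUMA_HINT_FAULT_STATS	(NR_NUMA_HINT_FAULT_TYPES * 2)
#define NR_NUMA_HINT_FAULT_BUCKETS	(NR_NUMA_HINT_FAULT_STATS * 2)

static int task_faults_idx(int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
}

int main(void)
{
	int nr_node_ids = 4;	/* hypothetical machine */
	size_t slots = (size_t)NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
	unsigned long *mem = calloc(slots, sizeof(*mem));

	if (!mem)
		return 1;

	/* The four views share the single allocation, as in the diff. */
	unsigned long *faults_memory = mem;			/* offset  0 */
	unsigned long *faults_cpu    = mem + 2 * nr_node_ids;	/* offset  8 */
	unsigned long *buffer_memory = mem + 4 * nr_node_ids;	/* offset 16 */
	unsigned long *buffer_cpu    = mem + 6 * nr_node_ids;	/* offset 24 */

	buffer_memory[task_faults_idx(3, 1)] += 16;	/* node 3, priv slot */

	printf("%zu counters total; node 3 priv slot = index %d\n",
	       slots, task_faults_idx(3, 1));
	(void)faults_memory; (void)faults_cpu; (void)buffer_cpu;
	free(mem);
	return 0;
}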
@@ -1641,7 +1797,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1641 if (migrated) 1797 if (migrated)
1642 p->numa_pages_migrated += pages; 1798 p->numa_pages_migrated += pages;
1643 1799
1644 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; 1800 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1801 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1645 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1802 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
1646} 1803}
1647 1804
@@ -2219,13 +2376,20 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
2219 se->avg.load_avg_contrib >>= NICE_0_SHIFT; 2376 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2220 } 2377 }
2221} 2378}
2222#else 2379
2380static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2381{
2382 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
2383 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2384}
2385#else /* CONFIG_FAIR_GROUP_SCHED */
2223static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, 2386static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2224 int force_update) {} 2387 int force_update) {}
2225static inline void __update_tg_runnable_avg(struct sched_avg *sa, 2388static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2226 struct cfs_rq *cfs_rq) {} 2389 struct cfs_rq *cfs_rq) {}
2227static inline void __update_group_entity_contrib(struct sched_entity *se) {} 2390static inline void __update_group_entity_contrib(struct sched_entity *se) {}
2228#endif 2391static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2392#endif /* CONFIG_FAIR_GROUP_SCHED */
2229 2393
2230static inline void __update_task_entity_contrib(struct sched_entity *se) 2394static inline void __update_task_entity_contrib(struct sched_entity *se)
2231{ 2395{
@@ -2323,12 +2487,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
2323 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); 2487 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
2324} 2488}
2325 2489
2326static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2327{
2328 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
2329 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2330}
2331
2332/* Add the load generated by se into cfs_rq's child load-average */ 2490/* Add the load generated by se into cfs_rq's child load-average */
2333static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, 2491static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2334 struct sched_entity *se, 2492 struct sched_entity *se,
@@ -2416,7 +2574,10 @@ void idle_exit_fair(struct rq *this_rq)
2416 update_rq_runnable_avg(this_rq, 0); 2574 update_rq_runnable_avg(this_rq, 0);
2417} 2575}
2418 2576
2419#else 2577static int idle_balance(struct rq *this_rq);
2578
2579#else /* CONFIG_SMP */
2580
2420static inline void update_entity_load_avg(struct sched_entity *se, 2581static inline void update_entity_load_avg(struct sched_entity *se,
2421 int update_cfs_rq) {} 2582 int update_cfs_rq) {}
2422static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} 2583static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
@@ -2428,7 +2589,13 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2428 int sleep) {} 2589 int sleep) {}
2429static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, 2590static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2430 int force_update) {} 2591 int force_update) {}
2431#endif 2592
2593static inline int idle_balance(struct rq *rq)
2594{
2595 return 0;
2596}
2597
2598#endif /* CONFIG_SMP */
2432 2599
2433static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 2600static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2434{ 2601{
@@ -2578,10 +2745,10 @@ static void __clear_buddies_last(struct sched_entity *se)
2578{ 2745{
2579 for_each_sched_entity(se) { 2746 for_each_sched_entity(se) {
2580 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2747 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2581 if (cfs_rq->last == se) 2748 if (cfs_rq->last != se)
2582 cfs_rq->last = NULL;
2583 else
2584 break; 2749 break;
2750
2751 cfs_rq->last = NULL;
2585 } 2752 }
2586} 2753}
2587 2754
@@ -2589,10 +2756,10 @@ static void __clear_buddies_next(struct sched_entity *se)
2589{ 2756{
2590 for_each_sched_entity(se) { 2757 for_each_sched_entity(se) {
2591 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2758 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2592 if (cfs_rq->next == se) 2759 if (cfs_rq->next != se)
2593 cfs_rq->next = NULL;
2594 else
2595 break; 2760 break;
2761
2762 cfs_rq->next = NULL;
2596 } 2763 }
2597} 2764}
2598 2765
@@ -2600,10 +2767,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
2600{ 2767{
2601 for_each_sched_entity(se) { 2768 for_each_sched_entity(se) {
2602 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2769 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2603 if (cfs_rq->skip == se) 2770 if (cfs_rq->skip != se)
2604 cfs_rq->skip = NULL;
2605 else
2606 break; 2771 break;
2772
2773 cfs_rq->skip = NULL;
2607 } 2774 }
2608} 2775}
2609 2776
@@ -2746,17 +2913,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2746 * 3) pick the "last" process, for cache locality 2913 * 3) pick the "last" process, for cache locality
2747 * 4) do not run the "skip" process, if something else is available 2914 * 4) do not run the "skip" process, if something else is available
2748 */ 2915 */
2749static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 2916static struct sched_entity *
2917pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2750{ 2918{
2751 struct sched_entity *se = __pick_first_entity(cfs_rq); 2919 struct sched_entity *left = __pick_first_entity(cfs_rq);
2752 struct sched_entity *left = se; 2920 struct sched_entity *se;
2921
2922 /*
2923 * If curr is set we have to see if its left of the leftmost entity
2924 * still in the tree, provided there was anything in the tree at all.
2925 */
2926 if (!left || (curr && entity_before(curr, left)))
2927 left = curr;
2928
2929 se = left; /* ideally we run the leftmost entity */
2753 2930
2754 /* 2931 /*
2755 * Avoid running the skip buddy, if running something else can 2932 * Avoid running the skip buddy, if running something else can
2756 * be done without getting too unfair. 2933 * be done without getting too unfair.
2757 */ 2934 */
2758 if (cfs_rq->skip == se) { 2935 if (cfs_rq->skip == se) {
2759 struct sched_entity *second = __pick_next_entity(se); 2936 struct sched_entity *second;
2937
2938 if (se == curr) {
2939 second = __pick_first_entity(cfs_rq);
2940 } else {
2941 second = __pick_next_entity(se);
2942 if (!second || (curr && entity_before(curr, second)))
2943 second = curr;
2944 }
2945
2760 if (second && wakeup_preempt_entity(second, left) < 1) 2946 if (second && wakeup_preempt_entity(second, left) < 1)
2761 se = second; 2947 se = second;
2762 } 2948 }
@@ -2778,7 +2964,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2778 return se; 2964 return se;
2779} 2965}
2780 2966
2781static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); 2967static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2782 2968
2783static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 2969static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2784{ 2970{
@@ -3433,22 +3619,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3433} 3619}
3434 3620
3435/* conditionally throttle active cfs_rq's from put_prev_entity() */ 3621/* conditionally throttle active cfs_rq's from put_prev_entity() */
3436static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 3622static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3437{ 3623{
3438 if (!cfs_bandwidth_used()) 3624 if (!cfs_bandwidth_used())
3439 return; 3625 return false;
3440 3626
3441 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 3627 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3442 return; 3628 return false;
3443 3629
3444 /* 3630 /*
3445 * it's possible for a throttled entity to be forced into a running 3631 * it's possible for a throttled entity to be forced into a running
3446 * state (e.g. set_curr_task), in this case we're finished. 3632 * state (e.g. set_curr_task), in this case we're finished.
3447 */ 3633 */
3448 if (cfs_rq_throttled(cfs_rq)) 3634 if (cfs_rq_throttled(cfs_rq))
3449 return; 3635 return true;
3450 3636
3451 throttle_cfs_rq(cfs_rq); 3637 throttle_cfs_rq(cfs_rq);
3638 return true;
3452} 3639}
3453 3640
3454static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 3641static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3558,7 +3745,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3558} 3745}
3559 3746
3560static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} 3747static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3561static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3748static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
3562static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 3749static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3563static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3750static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3564 3751
@@ -4213,13 +4400,14 @@ done:
4213} 4400}
4214 4401
4215/* 4402/*
4216 * sched_balance_self: balance the current task (running on cpu) in domains 4403 * select_task_rq_fair: Select target runqueue for the waking task in domains
4217 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 4404 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
4218 * SD_BALANCE_EXEC. 4405 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
4219 * 4406 *
4220 * Balance, ie. select the least loaded group. 4407 * Balances load by selecting the idlest cpu in the idlest group, or under
4408 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
4221 * 4409 *
4222 * Returns the target CPU number, or the same CPU if no balancing is needed. 4410 * Returns the target cpu number.
4223 * 4411 *
4224 * preempt must be disabled. 4412 * preempt must be disabled.
4225 */ 4413 */
@@ -4494,26 +4682,124 @@ preempt:
4494 set_last_buddy(se); 4682 set_last_buddy(se);
4495} 4683}
4496 4684
4497static struct task_struct *pick_next_task_fair(struct rq *rq) 4685static struct task_struct *
4686pick_next_task_fair(struct rq *rq, struct task_struct *prev)
4498{ 4687{
4499 struct task_struct *p;
4500 struct cfs_rq *cfs_rq = &rq->cfs; 4688 struct cfs_rq *cfs_rq = &rq->cfs;
4501 struct sched_entity *se; 4689 struct sched_entity *se;
4690 struct task_struct *p;
4691 int new_tasks;
4502 4692
4693again:
4694#ifdef CONFIG_FAIR_GROUP_SCHED
4503 if (!cfs_rq->nr_running) 4695 if (!cfs_rq->nr_running)
4504 return NULL; 4696 goto idle;
4697
4698 if (prev->sched_class != &fair_sched_class)
4699 goto simple;
4700
4701 /*
4702 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
4703 * likely that a next task is from the same cgroup as the current.
4704 *
4705 * Therefore attempt to avoid putting and setting the entire cgroup
4706 * hierarchy, only change the part that actually changes.
4707 */
4708
4709 do {
4710 struct sched_entity *curr = cfs_rq->curr;
4711
4712 /*
4713 * Since we got here without doing put_prev_entity() we also
4714 * have to consider cfs_rq->curr. If it is still a runnable
4715 * entity, update_curr() will update its vruntime, otherwise
4716 * forget we've ever seen it.
4717 */
4718 if (curr && curr->on_rq)
4719 update_curr(cfs_rq);
4720 else
4721 curr = NULL;
4722
4723 /*
4724 * This call to check_cfs_rq_runtime() will do the throttle and
4725 * dequeue its entity in the parent(s). Therefore the 'simple'
4726 * nr_running test will indeed be correct.
4727 */
4728 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
4729 goto simple;
4730
4731 se = pick_next_entity(cfs_rq, curr);
4732 cfs_rq = group_cfs_rq(se);
4733 } while (cfs_rq);
4734
4735 p = task_of(se);
4736
4737 /*
4738 * Since we haven't yet done put_prev_entity and if the selected task
4739 * is a different task than we started out with, try to touch the
4740 * fewest cfs_rqs possible.
4741 */
4742 if (prev != p) {
4743 struct sched_entity *pse = &prev->se;
4744
4745 while (!(cfs_rq = is_same_group(se, pse))) {
4746 int se_depth = se->depth;
4747 int pse_depth = pse->depth;
4748
4749 if (se_depth <= pse_depth) {
4750 put_prev_entity(cfs_rq_of(pse), pse);
4751 pse = parent_entity(pse);
4752 }
4753 if (se_depth >= pse_depth) {
4754 set_next_entity(cfs_rq_of(se), se);
4755 se = parent_entity(se);
4756 }
4757 }
4758
4759 put_prev_entity(cfs_rq, pse);
4760 set_next_entity(cfs_rq, se);
4761 }
4762
4763 if (hrtick_enabled(rq))
4764 hrtick_start_fair(rq, p);
4765
4766 return p;
4767simple:
4768 cfs_rq = &rq->cfs;
4769#endif
4770
4771 if (!cfs_rq->nr_running)
4772 goto idle;
4773
4774 put_prev_task(rq, prev);
4505 4775
4506 do { 4776 do {
4507 se = pick_next_entity(cfs_rq); 4777 se = pick_next_entity(cfs_rq, NULL);
4508 set_next_entity(cfs_rq, se); 4778 set_next_entity(cfs_rq, se);
4509 cfs_rq = group_cfs_rq(se); 4779 cfs_rq = group_cfs_rq(se);
4510 } while (cfs_rq); 4780 } while (cfs_rq);
4511 4781
4512 p = task_of(se); 4782 p = task_of(se);
4783
4513 if (hrtick_enabled(rq)) 4784 if (hrtick_enabled(rq))
4514 hrtick_start_fair(rq, p); 4785 hrtick_start_fair(rq, p);
4515 4786
4516 return p; 4787 return p;
4788
4789idle:
4790 new_tasks = idle_balance(rq);
4791 /*
4792 * Because idle_balance() releases (and re-acquires) rq->lock, it is
4793 * possible for any higher priority task to appear. In that case we
4794 * must re-start the pick_next_entity() loop.
4795 */
4796 if (new_tasks < 0)
4797 return RETRY_TASK;
4798
4799 if (new_tasks > 0)
4800 goto again;
4801
4802 return NULL;
4517} 4803}
4518 4804
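In the cgroup-aware fast path above, prev's and the new pick's entity chains are synchronised by climbing whichever side is deeper until they reach the same cfs_rq, so put_prev_entity()/set_next_entity() only touch the parts of the hierarchy that actually change. A toy user-space walk over an invented two-level cgroup tree (names and depths are hypothetical; "same group" is approximated as "same parent") traces where those calls would land:

#include <stdio.h>

struct entity {
	const char *name;
	struct entity *parent;
	int depth;
};

/* Rough model of the while (!is_same_group(se, pse)) loop above. */
static void sync_hierarchies(struct entity *se, struct entity *pse)
{
	while (se->parent != pse->parent) {
		int se_depth = se->depth;
		int pse_depth = pse->depth;

		if (se_depth <= pse_depth) {
			printf("put_prev_entity(%s)\n", pse->name);
			pse = pse->parent;
		}
		if (se_depth >= pse_depth) {
			printf("set_next_entity(%s)\n", se->name);
			se = se->parent;
		}
	}
	printf("put_prev_entity(%s) + set_next_entity(%s) in shared cfs_rq\n",
	       pse->name, se->name);
}

int main(void)
{
	struct entity grp_a = { "groupA", NULL,   0 };
	struct entity prev  = { "prev",   &grp_a, 1 };
	struct entity grp_b = { "groupB", NULL,   0 };
	struct entity grp_c = { "groupC", &grp_b, 1 };
	struct entity next  = { "next",   &grp_c, 2 };

	sync_hierarchies(&next, &prev);
	return 0;
}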
4519/* 4805/*
@@ -4751,7 +5037,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
4751 * Is this task likely cache-hot: 5037 * Is this task likely cache-hot:
4752 */ 5038 */
4753static int 5039static int
4754task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 5040task_hot(struct task_struct *p, u64 now)
4755{ 5041{
4756 s64 delta; 5042 s64 delta;
4757 5043
@@ -4785,7 +5071,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4785{ 5071{
4786 int src_nid, dst_nid; 5072 int src_nid, dst_nid;
4787 5073
4788 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || 5074 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
4789 !(env->sd->flags & SD_NUMA)) { 5075 !(env->sd->flags & SD_NUMA)) {
4790 return false; 5076 return false;
4791 } 5077 }
@@ -4816,7 +5102,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4816 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5102 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4817 return false; 5103 return false;
4818 5104
4819 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 5105 if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
4820 return false; 5106 return false;
4821 5107
4822 src_nid = cpu_to_node(env->src_cpu); 5108 src_nid = cpu_to_node(env->src_cpu);
@@ -4912,7 +5198,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4912 * 2) task is cache cold, or 5198 * 2) task is cache cold, or
4913 * 3) too many balance attempts have failed. 5199 * 3) too many balance attempts have failed.
4914 */ 5200 */
4915 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); 5201 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
4916 if (!tsk_cache_hot) 5202 if (!tsk_cache_hot)
4917 tsk_cache_hot = migrate_degrades_locality(p, env); 5203 tsk_cache_hot = migrate_degrades_locality(p, env);
4918 5204
@@ -5775,12 +6061,10 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
5775 pwr_now /= SCHED_POWER_SCALE; 6061 pwr_now /= SCHED_POWER_SCALE;
5776 6062
5777 /* Amount of load we'd subtract */ 6063 /* Amount of load we'd subtract */
5778 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / 6064 if (busiest->avg_load > scaled_busy_load_per_task) {
5779 busiest->group_power;
5780 if (busiest->avg_load > tmp) {
5781 pwr_move += busiest->group_power * 6065 pwr_move += busiest->group_power *
5782 min(busiest->load_per_task, 6066 min(busiest->load_per_task,
5783 busiest->avg_load - tmp); 6067 busiest->avg_load - scaled_busy_load_per_task);
5784 } 6068 }
5785 6069
5786 /* Amount of load we'd add */ 6070 /* Amount of load we'd add */
@@ -6359,17 +6643,23 @@ out:
6359 * idle_balance is called by schedule() if this_cpu is about to become 6643 * idle_balance is called by schedule() if this_cpu is about to become
6360 * idle. Attempts to pull tasks from other CPUs. 6644 * idle. Attempts to pull tasks from other CPUs.
6361 */ 6645 */
6362void idle_balance(int this_cpu, struct rq *this_rq) 6646static int idle_balance(struct rq *this_rq)
6363{ 6647{
6364 struct sched_domain *sd; 6648 struct sched_domain *sd;
6365 int pulled_task = 0; 6649 int pulled_task = 0;
6366 unsigned long next_balance = jiffies + HZ; 6650 unsigned long next_balance = jiffies + HZ;
6367 u64 curr_cost = 0; 6651 u64 curr_cost = 0;
6652 int this_cpu = this_rq->cpu;
6368 6653
6654 idle_enter_fair(this_rq);
6655 /*
6656 * We must set idle_stamp _before_ calling idle_balance(), such that we
6657 * measure the duration of idle_balance() as idle time.
6658 */
6369 this_rq->idle_stamp = rq_clock(this_rq); 6659 this_rq->idle_stamp = rq_clock(this_rq);
6370 6660
6371 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6661 if (this_rq->avg_idle < sysctl_sched_migration_cost)
6372 return; 6662 goto out;
6373 6663
6374 /* 6664 /*
6375 * Drop the rq->lock, but keep IRQ/preempt disabled. 6665 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6407,15 +6697,22 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6407 interval = msecs_to_jiffies(sd->balance_interval); 6697 interval = msecs_to_jiffies(sd->balance_interval);
6408 if (time_after(next_balance, sd->last_balance + interval)) 6698 if (time_after(next_balance, sd->last_balance + interval))
6409 next_balance = sd->last_balance + interval; 6699 next_balance = sd->last_balance + interval;
6410 if (pulled_task) { 6700 if (pulled_task)
6411 this_rq->idle_stamp = 0;
6412 break; 6701 break;
6413 }
6414 } 6702 }
6415 rcu_read_unlock(); 6703 rcu_read_unlock();
6416 6704
6417 raw_spin_lock(&this_rq->lock); 6705 raw_spin_lock(&this_rq->lock);
6418 6706
6707 /*
6708 * While browsing the domains, we released the rq lock.
6709 * A task could have been enqueued in the meantime.
6710 */
6711 if (this_rq->cfs.h_nr_running && !pulled_task) {
6712 pulled_task = 1;
6713 goto out;
6714 }
6715
6419 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6716 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
6420 /* 6717 /*
6421 * We are going idle. next_balance may be set based on 6718 * We are going idle. next_balance may be set based on
@@ -6426,6 +6723,20 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6426 6723
6427 if (curr_cost > this_rq->max_idle_balance_cost) 6724 if (curr_cost > this_rq->max_idle_balance_cost)
6428 this_rq->max_idle_balance_cost = curr_cost; 6725 this_rq->max_idle_balance_cost = curr_cost;
6726
6727out:
6728 /* Is there a task of a high priority class? */
6729 if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
6730 (this_rq->dl.dl_nr_running ||
6731 (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
6732 pulled_task = -1;
6733
6734 if (pulled_task) {
6735 idle_exit_fair(this_rq);
6736 this_rq->idle_stamp = 0;
6737 }
6738
6739 return pulled_task;
6429} 6740}
6430 6741
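idle_balance() now reports back to pick_next_task_fair() through its return value: negative when a deadline or RT task appeared while the rq lock was dropped, positive when fair tasks were pulled, and zero when the CPU really is idle. A trivial user-space sketch (hypothetical wrapper, not kernel code) of how the caller interprets it:

#include <stdio.h>

/* Mirrors how pick_next_task_fair() above reacts to idle_balance(). */
static const char *next_step(int new_tasks)
{
	if (new_tasks < 0)
		return "RETRY_TASK: a DL/RT task appeared, let the core pick again";
	if (new_tasks > 0)
		return "goto again: fair tasks were pulled, redo the entity walk";
	return "return NULL: truly idle";
}

int main(void)
{
	int i;

	for (i = -1; i <= 1; i++)
		printf("idle_balance() = %2d -> %s\n", i, next_step(i));
	return 0;
}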
6431/* 6742/*
@@ -6496,6 +6807,11 @@ out_unlock:
6496 return 0; 6807 return 0;
6497} 6808}
6498 6809
6810static inline int on_null_domain(struct rq *rq)
6811{
6812 return unlikely(!rcu_dereference_sched(rq->sd));
6813}
6814
6499#ifdef CONFIG_NO_HZ_COMMON 6815#ifdef CONFIG_NO_HZ_COMMON
6500/* 6816/*
6501 * idle load balancing details 6817 * idle load balancing details
@@ -6550,8 +6866,13 @@ static void nohz_balancer_kick(void)
6550static inline void nohz_balance_exit_idle(int cpu) 6866static inline void nohz_balance_exit_idle(int cpu)
6551{ 6867{
6552 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 6868 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
6553 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 6869 /*
6554 atomic_dec(&nohz.nr_cpus); 6870 * Completely isolated CPUs never set themselves in nohz.idle_cpus_mask, so we must test.
6871 */
6872 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
6873 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
6874 atomic_dec(&nohz.nr_cpus);
6875 }
6555 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 6876 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6556 } 6877 }
6557} 6878}
@@ -6605,6 +6926,12 @@ void nohz_balance_enter_idle(int cpu)
6605 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 6926 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
6606 return; 6927 return;
6607 6928
6929 /*
6930 * If we're a completely isolated CPU, we don't play.
6931 */
6932 if (on_null_domain(cpu_rq(cpu)))
6933 return;
6934
6608 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 6935 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
6609 atomic_inc(&nohz.nr_cpus); 6936 atomic_inc(&nohz.nr_cpus);
6610 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 6937 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
@@ -6867,11 +7194,6 @@ static void run_rebalance_domains(struct softirq_action *h)
6867 nohz_idle_balance(this_rq, idle); 7194 nohz_idle_balance(this_rq, idle);
6868} 7195}
6869 7196
6870static inline int on_null_domain(struct rq *rq)
6871{
6872 return !rcu_dereference_sched(rq->sd);
6873}
6874
6875/* 7197/*
6876 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 7198 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
6877 */ 7199 */
@@ -7036,7 +7358,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7036 */ 7358 */
7037static void switched_to_fair(struct rq *rq, struct task_struct *p) 7359static void switched_to_fair(struct rq *rq, struct task_struct *p)
7038{ 7360{
7039 if (!p->se.on_rq) 7361 struct sched_entity *se = &p->se;
7362#ifdef CONFIG_FAIR_GROUP_SCHED
7363 /*
7364 * Since the real-depth could have been changed (only FAIR
7365 * class maintain depth value), reset depth properly.
7366 */
7367 se->depth = se->parent ? se->parent->depth + 1 : 0;
7368#endif
7369 if (!se->on_rq)
7040 return; 7370 return;
7041 7371
7042 /* 7372 /*
@@ -7084,7 +7414,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7084#ifdef CONFIG_FAIR_GROUP_SCHED 7414#ifdef CONFIG_FAIR_GROUP_SCHED
7085static void task_move_group_fair(struct task_struct *p, int on_rq) 7415static void task_move_group_fair(struct task_struct *p, int on_rq)
7086{ 7416{
7417 struct sched_entity *se = &p->se;
7087 struct cfs_rq *cfs_rq; 7418 struct cfs_rq *cfs_rq;
7419
7088 /* 7420 /*
7089 * If the task was not on the rq at the time of this cgroup movement 7421 * If the task was not on the rq at the time of this cgroup movement
7090 * it must have been asleep, sleeping tasks keep their ->vruntime 7422 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -7110,23 +7442,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7110 * To prevent boost or penalty in the new cfs_rq caused by delta 7442 * To prevent boost or penalty in the new cfs_rq caused by delta
7111 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 7443 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7112 */ 7444 */
7113 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) 7445 if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7114 on_rq = 1; 7446 on_rq = 1;
7115 7447
7116 if (!on_rq) 7448 if (!on_rq)
7117 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 7449 se->vruntime -= cfs_rq_of(se)->min_vruntime;
7118 set_task_rq(p, task_cpu(p)); 7450 set_task_rq(p, task_cpu(p));
7451 se->depth = se->parent ? se->parent->depth + 1 : 0;
7119 if (!on_rq) { 7452 if (!on_rq) {
7120 cfs_rq = cfs_rq_of(&p->se); 7453 cfs_rq = cfs_rq_of(se);
7121 p->se.vruntime += cfs_rq->min_vruntime; 7454 se->vruntime += cfs_rq->min_vruntime;
7122#ifdef CONFIG_SMP 7455#ifdef CONFIG_SMP
7123 /* 7456 /*
7124 * migrate_task_rq_fair() will have removed our previous 7457 * migrate_task_rq_fair() will have removed our previous
7125 * contribution, but we must synchronize for ongoing future 7458 * contribution, but we must synchronize for ongoing future
7126 * decay. 7459 * decay.
7127 */ 7460 */
7128 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 7461 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7129 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; 7462 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
7130#endif 7463#endif
7131 } 7464 }
7132} 7465}
@@ -7222,10 +7555,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7222 if (!se) 7555 if (!se)
7223 return; 7556 return;
7224 7557
7225 if (!parent) 7558 if (!parent) {
7226 se->cfs_rq = &rq->cfs; 7559 se->cfs_rq = &rq->cfs;
7227 else 7560 se->depth = 0;
7561 } else {
7228 se->cfs_rq = parent->my_q; 7562 se->cfs_rq = parent->my_q;
7563 se->depth = parent->depth + 1;
7564 }
7229 7565
7230 se->my_q = cfs_rq; 7566 se->my_q = cfs_rq;
7231 /* guarantee group entities always have weight */ 7567 /* guarantee group entities always have weight */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
new file mode 100644
index 000000000000..8f4390a079c7
--- /dev/null
+++ b/kernel/sched/idle.c
@@ -0,0 +1,265 @@
1/*
2 * Generic entry point for the idle threads
3 */
4#include <linux/sched.h>
5#include <linux/cpu.h>
6#include <linux/cpuidle.h>
7#include <linux/tick.h>
8#include <linux/mm.h>
9#include <linux/stackprotector.h>
10
11#include <asm/tlb.h>
12
13#include <trace/events/power.h>
14
15static int __read_mostly cpu_idle_force_poll;
16
17void cpu_idle_poll_ctrl(bool enable)
18{
19 if (enable) {
20 cpu_idle_force_poll++;
21 } else {
22 cpu_idle_force_poll--;
23 WARN_ON_ONCE(cpu_idle_force_poll < 0);
24 }
25}
26
27#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
28static int __init cpu_idle_poll_setup(char *__unused)
29{
30 cpu_idle_force_poll = 1;
31 return 1;
32}
33__setup("nohlt", cpu_idle_poll_setup);
34
35static int __init cpu_idle_nopoll_setup(char *__unused)
36{
37 cpu_idle_force_poll = 0;
38 return 1;
39}
40__setup("hlt", cpu_idle_nopoll_setup);
41#endif
42
43static inline int cpu_idle_poll(void)
44{
45 rcu_idle_enter();
46 trace_cpu_idle_rcuidle(0, smp_processor_id());
47 local_irq_enable();
48 while (!tif_need_resched())
49 cpu_relax();
50 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
51 rcu_idle_exit();
52 return 1;
53}
54
55/* Weak implementations for optional arch specific functions */
56void __weak arch_cpu_idle_prepare(void) { }
57void __weak arch_cpu_idle_enter(void) { }
58void __weak arch_cpu_idle_exit(void) { }
59void __weak arch_cpu_idle_dead(void) { }
60void __weak arch_cpu_idle(void)
61{
62 cpu_idle_force_poll = 1;
63 local_irq_enable();
64}
65
66/**
67 * cpuidle_idle_call - the main idle function
68 *
69 * NOTE: no locks or semaphores should be used here
70 * return non-zero on failure
71 */
72static int cpuidle_idle_call(void)
73{
74 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
75 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
76 int next_state, entered_state, ret;
77 bool broadcast;
78
79 /*
80 * Check if the idle task must be rescheduled. If it is the
81 * case, exit the function after re-enabling the local irq and
82 * setting the polling flag again
83 */
84 if (current_clr_polling_and_test()) {
85 local_irq_enable();
86 __current_set_polling();
87 return 0;
88 }
89
90 /*
91 * During the idle period, stop measuring the disabled irqs
92 * critical sections latencies
93 */
94 stop_critical_timings();
95
96 /*
97 * Tell the RCU framework we are entering an idle section,
98 * so no more rcu read side critical sections and one more
99 * step to the grace period
100 */
101 rcu_idle_enter();
102
103 /*
104 * Check if the cpuidle framework is ready, otherwise fall back
105 * to the default arch specific idle method
106 */
107 ret = cpuidle_enabled(drv, dev);
108
109 if (!ret) {
110 /*
111 * Ask the governor to choose an idle state it thinks
112 * is convenient to enter. There is *always* a
113 * convenient idle state.
114 */
115 next_state = cpuidle_select(drv, dev);
116
117 /*
118 * The idle task needs to be rescheduled, so it is pointless
119 * to go idle; just record a zero idle residency and get
120 * out of this function
121 */
122 if (current_clr_polling_and_test()) {
123 dev->last_residency = 0;
124 entered_state = next_state;
125 local_irq_enable();
126 } else {
127 broadcast = !!(drv->states[next_state].flags &
128 CPUIDLE_FLAG_TIMER_STOP);
129
130 if (broadcast)
131 /*
132 * Tell the time framework to switch
133 * to a broadcast timer because our
134 * local timer will be shut down. If a
135 * local timer is used from another
136 * cpu as a broadcast timer, this call
137 * may fail if it is not available
138 */
139 ret = clockevents_notify(
140 CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
141 &dev->cpu);
142
143 if (!ret) {
144 trace_cpu_idle_rcuidle(next_state, dev->cpu);
145
146 /*
147 * Enter the idle state previously
148 * returned by the governor
149 * decision. This function will block
150 * until an interrupt occurs and will
151 * take care of re-enabling the local
152 * interrupts
153 */
154 entered_state = cpuidle_enter(drv, dev,
155 next_state);
156
157 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
158 dev->cpu);
159
160 if (broadcast)
161 clockevents_notify(
162 CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
163 &dev->cpu);
164
165 /*
166 * Give the governor an opportunity to reflect on the
167 * outcome
168 */
169 cpuidle_reflect(dev, entered_state);
170 }
171 }
172 }
173
174 /*
175 * We can't use the cpuidle framework, let's use the default
176 * idle routine
177 */
178 if (ret)
179 arch_cpu_idle();
180
181 __current_set_polling();
182
183 /*
184 * It is up to the idle functions to re-enable the local
185 * interrupts
186 */
187 if (WARN_ON_ONCE(irqs_disabled()))
188 local_irq_enable();
189
190 rcu_idle_exit();
191 start_critical_timings();
192
193 return 0;
194}
195
196/*
197 * Generic idle loop implementation
198 */
199static void cpu_idle_loop(void)
200{
201 while (1) {
202 tick_nohz_idle_enter();
203
204 while (!need_resched()) {
205 check_pgt_cache();
206 rmb();
207
208 if (cpu_is_offline(smp_processor_id()))
209 arch_cpu_idle_dead();
210
211 local_irq_disable();
212 arch_cpu_idle_enter();
213
214 /*
215 * In poll mode we reenable interrupts and spin.
216 *
217 * Also if we detected in the wakeup from idle
218 * path that the tick broadcast device expired
219 * for us, we don't want to go deep idle as we
220 * know that the IPI is going to arrive right
221 * away
222 */
223 if (cpu_idle_force_poll || tick_check_broadcast_expired())
224 cpu_idle_poll();
225 else
226 cpuidle_idle_call();
227
228 arch_cpu_idle_exit();
229 }
230
231 /*
232 * Since we fell out of the loop above, we know
233 * TIF_NEED_RESCHED must be set, propagate it into
234 * PREEMPT_NEED_RESCHED.
235 *
236 * This is required because for polling idle loops we will
237 * not have had an IPI to fold the state for us.
238 */
239 preempt_set_need_resched();
240 tick_nohz_idle_exit();
241 schedule_preempt_disabled();
242 }
243}
244
245void cpu_startup_entry(enum cpuhp_state state)
246{
247 /*
248 * This #ifdef needs to die, but it's too late in the cycle to
249 * make this generic (arm and sh have never invoked the canary
250 * init for the non boot cpus!). Will be fixed in 3.11
251 */
252#ifdef CONFIG_X86
253 /*
254 * If we're the non-boot CPU, nothing set the stack canary up
255 * for us. The boot CPU already has it initialized but no harm
256 * in doing it again. This is a good place for updating it, as
257 * we won't ever return from this function (so the invalid
258 * canaries already on the stack wont ever trigger).
259 */
260 boot_init_stack_canary();
261#endif
262 __current_set_polling();
263 arch_cpu_idle_prepare();
264 cpu_idle_loop();
265}
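Each pass through cpu_idle_loop() above picks one of three idle methods: polling when forced or when the broadcast timer already fired, the cpuidle framework when it is usable, and the architecture default otherwise. A compact user-space restatement of that decision (the boolean inputs are hypothetical stand-ins for cpu_idle_force_poll, tick_check_broadcast_expired() and cpuidle_enabled()):

#include <stdbool.h>
#include <stdio.h>

/* The per-iteration choice made inside cpu_idle_loop() above. */
static const char *idle_method(bool force_poll, bool broadcast_expired,
			       bool cpuidle_ok)
{
	if (force_poll || broadcast_expired)
		return "cpu_idle_poll";		/* spin with IRQs enabled */
	if (cpuidle_ok)
		return "cpuidle_enter";		/* governor-selected state */
	return "arch_cpu_idle";			/* default arch idle */
}

int main(void)
{
	printf("%s\n", idle_method(false, false, true));
	printf("%s\n", idle_method(false, true, true));
	printf("%s\n", idle_method(false, false, false));
	return 0;
}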
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 516c3d9ceea1..879f2b75266a 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,18 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
14 return task_cpu(p); /* IDLE tasks are never migrated */ 14 return task_cpu(p); /* IDLE tasks are never migrated */
15} 15}
16
17static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
18{
19 idle_exit_fair(rq);
20 rq_last_tick_reset(rq);
21}
22
23static void post_schedule_idle(struct rq *rq)
24{
25 idle_enter_fair(rq);
26}
27#endif /* CONFIG_SMP */ 16#endif /* CONFIG_SMP */
17
28/* 18/*
29 * Idle tasks are unconditionally rescheduled: 19 * Idle tasks are unconditionally rescheduled:
30 */ 20 */
@@ -33,13 +23,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
33 resched_task(rq->idle); 23 resched_task(rq->idle);
34} 24}
35 25
36static struct task_struct *pick_next_task_idle(struct rq *rq) 26static struct task_struct *
27pick_next_task_idle(struct rq *rq, struct task_struct *prev)
37{ 28{
29 put_prev_task(rq, prev);
30
38 schedstat_inc(rq, sched_goidle); 31 schedstat_inc(rq, sched_goidle);
39#ifdef CONFIG_SMP
40 /* Trigger the post schedule to do an idle_enter for CFS */
41 rq->post_schedule = 1;
42#endif
43 return rq->idle; 32 return rq->idle;
44} 33}
45 34
@@ -58,6 +47,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
58 47
59static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 48static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
60{ 49{
50 idle_exit_fair(rq);
51 rq_last_tick_reset(rq);
61} 52}
62 53
63static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 54static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
@@ -101,8 +92,6 @@ const struct sched_class idle_sched_class = {
101 92
102#ifdef CONFIG_SMP 93#ifdef CONFIG_SMP
103 .select_task_rq = select_task_rq_idle, 94 .select_task_rq = select_task_rq_idle,
104 .pre_schedule = pre_schedule_idle,
105 .post_schedule = post_schedule_idle,
106#endif 95#endif
107 96
108 .set_curr_task = set_curr_task_idle, 97 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1999021042c7..d8cdf1618551 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -229,6 +229,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
229 229
230#ifdef CONFIG_SMP 230#ifdef CONFIG_SMP
231 231
232static int pull_rt_task(struct rq *this_rq);
233
234static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
235{
236 /* Try to pull RT tasks here if we lower this rq's prio */
237 return rq->rt.highest_prio.curr > prev->prio;
238}
239
232static inline int rt_overloaded(struct rq *rq) 240static inline int rt_overloaded(struct rq *rq)
233{ 241{
234 return atomic_read(&rq->rd->rto_count); 242 return atomic_read(&rq->rd->rto_count);
@@ -315,6 +323,15 @@ static inline int has_pushable_tasks(struct rq *rq)
315 return !plist_head_empty(&rq->rt.pushable_tasks); 323 return !plist_head_empty(&rq->rt.pushable_tasks);
316} 324}
317 325
326static inline void set_post_schedule(struct rq *rq)
327{
328 /*
329 * We detect this state here so that we can avoid taking the RQ
330 * lock again later if there is no need to push
331 */
332 rq->post_schedule = has_pushable_tasks(rq);
333}
334
318static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 335static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
319{ 336{
320 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 337 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
@@ -359,6 +376,19 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
359{ 376{
360} 377}
361 378
379static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
380{
381 return false;
382}
383
384static inline int pull_rt_task(struct rq *this_rq)
385{
386 return 0;
387}
388
389static inline void set_post_schedule(struct rq *rq)
390{
391}
362#endif /* CONFIG_SMP */ 392#endif /* CONFIG_SMP */
363 393
364static inline int on_rt_rq(struct sched_rt_entity *rt_se) 394static inline int on_rt_rq(struct sched_rt_entity *rt_se)
@@ -440,11 +470,6 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
440 dequeue_rt_entity(rt_se); 470 dequeue_rt_entity(rt_se);
441} 471}
442 472
443static inline int rt_rq_throttled(struct rt_rq *rt_rq)
444{
445 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
446}
447
448static int rt_se_boosted(struct sched_rt_entity *rt_se) 473static int rt_se_boosted(struct sched_rt_entity *rt_se)
449{ 474{
450 struct rt_rq *rt_rq = group_rt_rq(rt_se); 475 struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -515,11 +540,6 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
515{ 540{
516} 541}
517 542
518static inline int rt_rq_throttled(struct rt_rq *rt_rq)
519{
520 return rt_rq->rt_throttled;
521}
522
523static inline const struct cpumask *sched_rt_period_mask(void) 543static inline const struct cpumask *sched_rt_period_mask(void)
524{ 544{
525 return cpu_online_mask; 545 return cpu_online_mask;
@@ -1318,15 +1338,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1318{ 1338{
1319 struct sched_rt_entity *rt_se; 1339 struct sched_rt_entity *rt_se;
1320 struct task_struct *p; 1340 struct task_struct *p;
1321 struct rt_rq *rt_rq; 1341 struct rt_rq *rt_rq = &rq->rt;
1322
1323 rt_rq = &rq->rt;
1324
1325 if (!rt_rq->rt_nr_running)
1326 return NULL;
1327
1328 if (rt_rq_throttled(rt_rq))
1329 return NULL;
1330 1342
1331 do { 1343 do {
1332 rt_se = pick_next_rt_entity(rq, rt_rq); 1344 rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1340,21 +1352,45 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1340 return p; 1352 return p;
1341} 1353}
1342 1354
1343static struct task_struct *pick_next_task_rt(struct rq *rq) 1355static struct task_struct *
1356pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1344{ 1357{
1345 struct task_struct *p = _pick_next_task_rt(rq); 1358 struct task_struct *p;
1359 struct rt_rq *rt_rq = &rq->rt;
1360
1361 if (need_pull_rt_task(rq, prev)) {
1362 pull_rt_task(rq);
1363 /*
1364 * pull_rt_task() can drop (and re-acquire) rq->lock; this
1365 * means a dl task can slip in, in which case we need to
1366 * re-start task selection.
1367 */
1368 if (unlikely(rq->dl.dl_nr_running))
1369 return RETRY_TASK;
1370 }
1371
1372 /*
1373 * We may dequeue prev's rt_rq in put_prev_task().
1374 * So, we update time before rt_nr_running check.
1375 */
1376 if (prev->sched_class == &rt_sched_class)
1377 update_curr_rt(rq);
1378
1379 if (!rt_rq->rt_nr_running)
1380 return NULL;
1381
1382 if (rt_rq_throttled(rt_rq))
1383 return NULL;
1384
1385 put_prev_task(rq, prev);
1386
1387 p = _pick_next_task_rt(rq);
1346 1388
1347 /* The running task is never eligible for pushing */ 1389 /* The running task is never eligible for pushing */
1348 if (p) 1390 if (p)
1349 dequeue_pushable_task(rq, p); 1391 dequeue_pushable_task(rq, p);
1350 1392
1351#ifdef CONFIG_SMP 1393 set_post_schedule(rq);
1352 /*
1353 * We detect this state here so that we can avoid taking the RQ
1354 * lock again later if there is no need to push
1355 */
1356 rq->post_schedule = has_pushable_tasks(rq);
1357#endif
1358 1394
1359 return p; 1395 return p;
1360} 1396}
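The new need_pull_rt_task() helper fires only when the rq's priority is about to drop, i.e. the best queued RT task has lower priority (a larger number) than the task that just ran. A user-space sketch with invented priority values, following the kernel convention that lower numbers mean higher priority:

#include <stdbool.h>
#include <stdio.h>

/* Same comparison as need_pull_rt_task() above, with plain ints. */
static bool need_pull_rt_task(int rq_highest_rt_prio, int prev_prio)
{
	return rq_highest_rt_prio > prev_prio;
}

int main(void)
{
	/* prev was prio 10; best queued RT task is prio 40: the rq's
	 * priority is about to drop, so try to pull from other CPUs. */
	printf("%d\n", need_pull_rt_task(40, 10));
	/* prev was prio 40; a prio-10 RT task is queued: no pull needed. */
	printf("%d\n", need_pull_rt_task(10, 40));
	return 0;
}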
@@ -1724,13 +1760,6 @@ skip:
1724 return ret; 1760 return ret;
1725} 1761}
1726 1762
1727static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1728{
1729 /* Try to pull RT tasks here if we lower this rq's prio */
1730 if (rq->rt.highest_prio.curr > prev->prio)
1731 pull_rt_task(rq);
1732}
1733
1734static void post_schedule_rt(struct rq *rq) 1763static void post_schedule_rt(struct rq *rq)
1735{ 1764{
1736 push_rt_tasks(rq); 1765 push_rt_tasks(rq);
@@ -1833,7 +1862,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1833 resched_task(rq->curr); 1862 resched_task(rq->curr);
1834} 1863}
1835 1864
1836void init_sched_rt_class(void) 1865void __init init_sched_rt_class(void)
1837{ 1866{
1838 unsigned int i; 1867 unsigned int i;
1839 1868
@@ -2007,7 +2036,6 @@ const struct sched_class rt_sched_class = {
2007 .set_cpus_allowed = set_cpus_allowed_rt, 2036 .set_cpus_allowed = set_cpus_allowed_rt,
2008 .rq_online = rq_online_rt, 2037 .rq_online = rq_online_rt,
2009 .rq_offline = rq_offline_rt, 2038 .rq_offline = rq_offline_rt,
2010 .pre_schedule = pre_schedule_rt,
2011 .post_schedule = post_schedule_rt, 2039 .post_schedule = post_schedule_rt,
2012 .task_woken = task_woken_rt, 2040 .task_woken = task_woken_rt,
2013 .switched_from = switched_from_rt, 2041 .switched_from = switched_from_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f964add50f38..c9007f28d3a2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -24,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq);
24extern void update_cpu_load_active(struct rq *this_rq); 24extern void update_cpu_load_active(struct rq *this_rq);
25 25
26/* 26/*
27 * Convert user-nice values [ -20 ... 0 ... 19 ]
28 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
29 * and back.
30 */
31#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
32#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
33#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
34
35/*
36 * 'User priority' is the nice value converted to something we
37 * can work with better when scaling various scheduler parameters,
38 * it's a [ 0 ... 39 ] range.
39 */
40#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
41#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
42#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
43
44/*
45 * Helpers for converting nanosecond timing to jiffy resolution 27 * Helpers for converting nanosecond timing to jiffy resolution
46 */ 28 */
47#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 29#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
@@ -441,6 +423,18 @@ struct rt_rq {
441#endif 423#endif
442}; 424};
443 425
426#ifdef CONFIG_RT_GROUP_SCHED
427static inline int rt_rq_throttled(struct rt_rq *rt_rq)
428{
429 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
430}
431#else
432static inline int rt_rq_throttled(struct rt_rq *rt_rq)
433{
434 return rt_rq->rt_throttled;
435}
436#endif
437
444/* Deadline class' related fields in a runqueue */ 438/* Deadline class' related fields in a runqueue */
445struct dl_rq { 439struct dl_rq {
446 /* runqueue is an rbtree, ordered by deadline */ 440 /* runqueue is an rbtree, ordered by deadline */
@@ -558,11 +552,9 @@ struct rq {
558#ifdef CONFIG_FAIR_GROUP_SCHED 552#ifdef CONFIG_FAIR_GROUP_SCHED
559 /* list of leaf cfs_rq on this cpu: */ 553 /* list of leaf cfs_rq on this cpu: */
560 struct list_head leaf_cfs_rq_list; 554 struct list_head leaf_cfs_rq_list;
561#endif /* CONFIG_FAIR_GROUP_SCHED */
562 555
563#ifdef CONFIG_RT_GROUP_SCHED 556 struct sched_avg avg;
564 struct list_head leaf_rt_rq_list; 557#endif /* CONFIG_FAIR_GROUP_SCHED */
565#endif
566 558
567 /* 559 /*
568 * This is part of a global counter where only the total sum 560 * This is part of a global counter where only the total sum
@@ -651,8 +643,6 @@ struct rq {
651#ifdef CONFIG_SMP 643#ifdef CONFIG_SMP
652 struct llist_head wake_list; 644 struct llist_head wake_list;
653#endif 645#endif
654
655 struct sched_avg avg;
656}; 646};
657 647
658static inline int cpu_of(struct rq *rq) 648static inline int cpu_of(struct rq *rq)
@@ -1112,6 +1102,8 @@ static const u32 prio_to_wmult[40] = {
1112 1102
1113#define DEQUEUE_SLEEP 1 1103#define DEQUEUE_SLEEP 1
1114 1104
1105#define RETRY_TASK ((void *)-1UL)
1106
1115struct sched_class { 1107struct sched_class {
1116 const struct sched_class *next; 1108 const struct sched_class *next;
1117 1109
@@ -1122,14 +1114,22 @@ struct sched_class {
1122 1114
1123 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1115 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1124 1116
1125 struct task_struct * (*pick_next_task) (struct rq *rq); 1117 /*
1118 * It is the responsibility of the pick_next_task() method that will
1119 * return the next task to call put_prev_task() on the @prev task or
1120 * something equivalent.
1121 *
1122 * May return RETRY_TASK when it finds a higher prio class has runnable
1123 * tasks.
1124 */
1125 struct task_struct * (*pick_next_task) (struct rq *rq,
1126 struct task_struct *prev);
1126 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1127 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1127 1128
1128#ifdef CONFIG_SMP 1129#ifdef CONFIG_SMP
1129 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1130 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1130 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1131 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
1131 1132
1132 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1133 void (*post_schedule) (struct rq *this_rq); 1133 void (*post_schedule) (struct rq *this_rq);
1134 void (*task_waking) (struct task_struct *task); 1134 void (*task_waking) (struct task_struct *task);
1135 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1135 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1159,6 +1159,11 @@ struct sched_class {
1159#endif 1159#endif
1160}; 1160};
1161 1161
1162static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
1163{
1164 prev->sched_class->put_prev_task(rq, prev);
1165}
1166
1162#define sched_class_highest (&stop_sched_class) 1167#define sched_class_highest (&stop_sched_class)
1163#define for_each_class(class) \ 1168#define for_each_class(class) \
1164 for (class = sched_class_highest; class; class = class->next) 1169 for (class = sched_class_highest; class; class = class->next)
@@ -1175,16 +1180,14 @@ extern const struct sched_class idle_sched_class;
1175extern void update_group_power(struct sched_domain *sd, int cpu); 1180extern void update_group_power(struct sched_domain *sd, int cpu);
1176 1181
1177extern void trigger_load_balance(struct rq *rq); 1182extern void trigger_load_balance(struct rq *rq);
1178extern void idle_balance(int this_cpu, struct rq *this_rq);
1179 1183
1180extern void idle_enter_fair(struct rq *this_rq); 1184extern void idle_enter_fair(struct rq *this_rq);
1181extern void idle_exit_fair(struct rq *this_rq); 1185extern void idle_exit_fair(struct rq *this_rq);
1182 1186
1183#else /* CONFIG_SMP */ 1187#else
1184 1188
1185static inline void idle_balance(int cpu, struct rq *rq) 1189static inline void idle_enter_fair(struct rq *rq) { }
1186{ 1190static inline void idle_exit_fair(struct rq *rq) { }
1187}
1188 1191
1189#endif 1192#endif
1190 1193
@@ -1213,16 +1216,6 @@ extern void update_idle_cpu_load(struct rq *this_rq);
1213 1216
1214extern void init_task_runnable_average(struct task_struct *p); 1217extern void init_task_runnable_average(struct task_struct *p);
1215 1218
1216#ifdef CONFIG_PARAVIRT
1217static inline u64 steal_ticks(u64 steal)
1218{
1219 if (unlikely(steal > NSEC_PER_SEC))
1220 return div_u64(steal, TICK_NSEC);
1221
1222 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
1223}
1224#endif
1225
1226static inline void inc_nr_running(struct rq *rq) 1219static inline void inc_nr_running(struct rq *rq)
1227{ 1220{
1228 rq->nr_running++; 1221 rq->nr_running++;
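
The pick_next_task() contract documented above is easiest to read from the caller's side: a class hands back RETRY_TASK when a higher-priority class became runnable while it dropped rq->lock, and it is the class itself, not the core scheduler, that calls put_prev_task() on @prev. A minimal sketch of a pick loop consuming that contract, assuming the kernel/sched/sched.h definitions above (illustrative only, not the actual kernel/sched/core.c code):

/* Sketch only; assumes kernel/sched/sched.h context (RETRY_TASK, for_each_class). */
static struct task_struct *
pick_next_task_sketch(struct rq *rq, struct task_struct *prev)
{
	const struct sched_class *class;
	struct task_struct *p;

again:
	for_each_class(class) {
		p = class->pick_next_task(rq, prev);
		if (p) {
			/* A higher prio class got runnable tasks meanwhile. */
			if (unlikely(p == RETRY_TASK))
				goto again;
			/* The class has already done put_prev_task(rq, prev). */
			return p;
		}
	}
	/* The idle class never returns NULL, so we cannot get here. */
	BUG();
}
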
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index da98af347e8b..a476bea17fbc 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -142,4 +142,4 @@ static int __init proc_schedstat_init(void)
142 proc_create("schedstat", 0, NULL, &proc_schedstat_operations); 142 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
143 return 0; 143 return 0;
144} 144}
145module_init(proc_schedstat_init); 145subsys_initcall(proc_schedstat_init);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fdb6bb0b3356..d6ce65dde541 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,16 +23,19 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
23 /* we're never preempted */ 23 /* we're never preempted */
24} 24}
25 25
26static struct task_struct *pick_next_task_stop(struct rq *rq) 26static struct task_struct *
27pick_next_task_stop(struct rq *rq, struct task_struct *prev)
27{ 28{
28 struct task_struct *stop = rq->stop; 29 struct task_struct *stop = rq->stop;
29 30
30 if (stop && stop->on_rq) { 31 if (!stop || !stop->on_rq)
31 stop->se.exec_start = rq_clock_task(rq); 32 return NULL;
32 return stop;
33 }
34 33
35 return NULL; 34 put_prev_task(rq, prev);
35
36 stop->se.exec_start = rq_clock_task(rq);
37
38 return stop;
36} 39}
37 40
38static void 41static void
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index b7a10048a32c..fd609bd9d6dd 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -55,60 +55,33 @@ struct seccomp_filter {
55 atomic_t usage; 55 atomic_t usage;
56 struct seccomp_filter *prev; 56 struct seccomp_filter *prev;
57 unsigned short len; /* Instruction count */ 57 unsigned short len; /* Instruction count */
58 struct sock_filter insns[]; 58 struct sock_filter_int insnsi[];
59}; 59};
60 60
61/* Limit any path through the tree to 256KB worth of instructions. */ 61/* Limit any path through the tree to 256KB worth of instructions. */
62#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) 62#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
63 63
64/** 64/*
65 * get_u32 - returns a u32 offset into data
66 * @data: a unsigned 64 bit value
67 * @index: 0 or 1 to return the first or second 32-bits
68 *
69 * This inline exists to hide the length of unsigned long. If a 32-bit
70 * unsigned long is passed in, it will be extended and the top 32-bits will be
71 * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
72 * properly returned.
73 *
74 * Endianness is explicitly ignored and left for BPF program authors to manage 65 * Endianness is explicitly ignored and left for BPF program authors to manage
75 * as per the specific architecture. 66 * as per the specific architecture.
76 */ 67 */
77static inline u32 get_u32(u64 data, int index) 68static void populate_seccomp_data(struct seccomp_data *sd)
78{ 69{
79 return ((u32 *)&data)[index]; 70 struct task_struct *task = current;
80} 71 struct pt_regs *regs = task_pt_regs(task);
81 72
82/* Helper for bpf_load below. */ 73 sd->nr = syscall_get_nr(task, regs);
83#define BPF_DATA(_name) offsetof(struct seccomp_data, _name) 74 sd->arch = syscall_get_arch(task, regs);
84/** 75
85 * bpf_load: checks and returns a pointer to the requested offset 76 /* Unroll syscall_get_args to help gcc on arm. */
86 * @off: offset into struct seccomp_data to load from 77 syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]);
87 * 78 syscall_get_arguments(task, regs, 1, 1, (unsigned long *) &sd->args[1]);
88 * Returns the requested 32-bits of data. 79 syscall_get_arguments(task, regs, 2, 1, (unsigned long *) &sd->args[2]);
89 * seccomp_check_filter() should assure that @off is 32-bit aligned 80 syscall_get_arguments(task, regs, 3, 1, (unsigned long *) &sd->args[3]);
90 * and not out of bounds. Failure to do so is a BUG. 81 syscall_get_arguments(task, regs, 4, 1, (unsigned long *) &sd->args[4]);
91 */ 82 syscall_get_arguments(task, regs, 5, 1, (unsigned long *) &sd->args[5]);
92u32 seccomp_bpf_load(int off) 83
93{ 84 sd->instruction_pointer = KSTK_EIP(task);
94 struct pt_regs *regs = task_pt_regs(current);
95 if (off == BPF_DATA(nr))
96 return syscall_get_nr(current, regs);
97 if (off == BPF_DATA(arch))
98 return syscall_get_arch(current, regs);
99 if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
100 unsigned long value;
101 int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
102 int index = !!(off % sizeof(u64));
103 syscall_get_arguments(current, regs, arg, 1, &value);
104 return get_u32(value, index);
105 }
106 if (off == BPF_DATA(instruction_pointer))
107 return get_u32(KSTK_EIP(current), 0);
108 if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
109 return get_u32(KSTK_EIP(current), 1);
110 /* seccomp_check_filter should make this impossible. */
111 BUG();
112} 85}
113 86
114/** 87/**
@@ -133,17 +106,17 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
133 106
134 switch (code) { 107 switch (code) {
135 case BPF_S_LD_W_ABS: 108 case BPF_S_LD_W_ABS:
136 ftest->code = BPF_S_ANC_SECCOMP_LD_W; 109 ftest->code = BPF_LDX | BPF_W | BPF_ABS;
137 /* 32-bit aligned and not out of bounds. */ 110 /* 32-bit aligned and not out of bounds. */
138 if (k >= sizeof(struct seccomp_data) || k & 3) 111 if (k >= sizeof(struct seccomp_data) || k & 3)
139 return -EINVAL; 112 return -EINVAL;
140 continue; 113 continue;
141 case BPF_S_LD_W_LEN: 114 case BPF_S_LD_W_LEN:
142 ftest->code = BPF_S_LD_IMM; 115 ftest->code = BPF_LD | BPF_IMM;
143 ftest->k = sizeof(struct seccomp_data); 116 ftest->k = sizeof(struct seccomp_data);
144 continue; 117 continue;
145 case BPF_S_LDX_W_LEN: 118 case BPF_S_LDX_W_LEN:
146 ftest->code = BPF_S_LDX_IMM; 119 ftest->code = BPF_LDX | BPF_IMM;
147 ftest->k = sizeof(struct seccomp_data); 120 ftest->k = sizeof(struct seccomp_data);
148 continue; 121 continue;
149 /* Explicitly include allowed calls. */ 122 /* Explicitly include allowed calls. */
@@ -185,6 +158,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
185 case BPF_S_JMP_JGT_X: 158 case BPF_S_JMP_JGT_X:
186 case BPF_S_JMP_JSET_K: 159 case BPF_S_JMP_JSET_K:
187 case BPF_S_JMP_JSET_X: 160 case BPF_S_JMP_JSET_X:
161 sk_decode_filter(ftest, ftest);
188 continue; 162 continue;
189 default: 163 default:
190 return -EINVAL; 164 return -EINVAL;
@@ -202,18 +176,21 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
202static u32 seccomp_run_filters(int syscall) 176static u32 seccomp_run_filters(int syscall)
203{ 177{
204 struct seccomp_filter *f; 178 struct seccomp_filter *f;
179 struct seccomp_data sd;
205 u32 ret = SECCOMP_RET_ALLOW; 180 u32 ret = SECCOMP_RET_ALLOW;
206 181
207 /* Ensure unexpected behavior doesn't result in failing open. */ 182 /* Ensure unexpected behavior doesn't result in failing open. */
208 if (WARN_ON(current->seccomp.filter == NULL)) 183 if (WARN_ON(current->seccomp.filter == NULL))
209 return SECCOMP_RET_KILL; 184 return SECCOMP_RET_KILL;
210 185
186 populate_seccomp_data(&sd);
187
211 /* 188 /*
212 * All filters in the list are evaluated and the lowest BPF return 189 * All filters in the list are evaluated and the lowest BPF return
213 * value always takes priority (ignoring the DATA). 190 * value always takes priority (ignoring the DATA).
214 */ 191 */
215 for (f = current->seccomp.filter; f; f = f->prev) { 192 for (f = current->seccomp.filter; f; f = f->prev) {
216 u32 cur_ret = sk_run_filter(NULL, f->insns); 193 u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi);
217 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) 194 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
218 ret = cur_ret; 195 ret = cur_ret;
219 } 196 }
@@ -231,6 +208,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
231 struct seccomp_filter *filter; 208 struct seccomp_filter *filter;
232 unsigned long fp_size = fprog->len * sizeof(struct sock_filter); 209 unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
233 unsigned long total_insns = fprog->len; 210 unsigned long total_insns = fprog->len;
211 struct sock_filter *fp;
212 int new_len;
234 long ret; 213 long ret;
235 214
236 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) 215 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
@@ -252,28 +231,43 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
252 CAP_SYS_ADMIN) != 0) 231 CAP_SYS_ADMIN) != 0)
253 return -EACCES; 232 return -EACCES;
254 233
255 /* Allocate a new seccomp_filter */ 234 fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
256 filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, 235 if (!fp)
257 GFP_KERNEL|__GFP_NOWARN);
258 if (!filter)
259 return -ENOMEM; 236 return -ENOMEM;
260 atomic_set(&filter->usage, 1);
261 filter->len = fprog->len;
262 237
263 /* Copy the instructions from fprog. */ 238 /* Copy the instructions from fprog. */
264 ret = -EFAULT; 239 ret = -EFAULT;
265 if (copy_from_user(filter->insns, fprog->filter, fp_size)) 240 if (copy_from_user(fp, fprog->filter, fp_size))
266 goto fail; 241 goto free_prog;
267 242
268 /* Check and rewrite the fprog via the skb checker */ 243 /* Check and rewrite the fprog via the skb checker */
269 ret = sk_chk_filter(filter->insns, filter->len); 244 ret = sk_chk_filter(fp, fprog->len);
270 if (ret) 245 if (ret)
271 goto fail; 246 goto free_prog;
272 247
273 /* Check and rewrite the fprog for seccomp use */ 248 /* Check and rewrite the fprog for seccomp use */
274 ret = seccomp_check_filter(filter->insns, filter->len); 249 ret = seccomp_check_filter(fp, fprog->len);
250 if (ret)
251 goto free_prog;
252
253 /* Convert 'sock_filter' insns to 'sock_filter_int' insns */
254 ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
255 if (ret)
256 goto free_prog;
257
258 /* Allocate a new seccomp_filter */
259 filter = kzalloc(sizeof(struct seccomp_filter) +
260 sizeof(struct sock_filter_int) * new_len,
261 GFP_KERNEL|__GFP_NOWARN);
262 if (!filter)
263 goto free_prog;
264
265 ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len);
275 if (ret) 266 if (ret)
276 goto fail; 267 goto free_filter;
268
269 atomic_set(&filter->usage, 1);
270 filter->len = new_len;
277 271
278 /* 272 /*
279 * If there is an existing filter, make it the prev and don't drop its 273 * If there is an existing filter, make it the prev and don't drop its
@@ -282,8 +276,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
282 filter->prev = current->seccomp.filter; 276 filter->prev = current->seccomp.filter;
283 current->seccomp.filter = filter; 277 current->seccomp.filter = filter;
284 return 0; 278 return 0;
285fail: 279
280free_filter:
286 kfree(filter); 281 kfree(filter);
282free_prog:
283 kfree(fp);
287 return ret; 284 return ret;
288} 285}
289 286
@@ -293,7 +290,7 @@ fail:
293 * 290 *
294 * Returns 0 on success and non-zero otherwise. 291 * Returns 0 on success and non-zero otherwise.
295 */ 292 */
296long seccomp_attach_user_filter(char __user *user_filter) 293static long seccomp_attach_user_filter(char __user *user_filter)
297{ 294{
298 struct sock_fprog fprog; 295 struct sock_fprog fprog;
299 long ret = -EFAULT; 296 long ret = -EFAULT;
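
The switch to sock_filter_int is internal to the kernel: user space still installs a classic BPF program as a struct sock_fprog, and seccomp_attach_filter() converts it after validation. A minimal user-space sketch of that unchanged ABI (illustrative; a production filter should also validate seccomp_data->arch before trusting the syscall number):

#include <stddef.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	/* Return EPERM for getpid(), allow everything else. */
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};

	/* Required so an unprivileged process may install a filter. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return 1;
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		return 1;

	errno = 0;
	printf("getpid() = %ld (errno=%d)\n", (long)syscall(__NR_getpid), errno);
	return 0;
}
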
diff --git a/kernel/signal.c b/kernel/signal.c
index 52f881db1ca0..5d4b05a229a6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2382,7 +2382,7 @@ relock:
2382 * @regs: user register state 2382 * @regs: user register state
2383 * @stepping: nonzero if debugger single-step or block-step in use 2383 * @stepping: nonzero if debugger single-step or block-step in use
2384 * 2384 *
2385 * This function should be called when a signal has succesfully been 2385 * This function should be called when a signal has successfully been
2386 * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask 2386 * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask
2387 * is always blocked, and the signal itself is blocked unless %SA_NODEFER 2387 * is always blocked, and the signal itself is blocked unless %SA_NODEFER
2388 * is set in @ka->sa.sa_flags. Tracing is notified. 2388 * is set in @ka->sa.sa_flags. Tracing is notified.
diff --git a/kernel/smp.c b/kernel/smp.c
index ffee35bef179..06d574e42c72 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -117,13 +117,43 @@ static void csd_unlock(struct call_single_data *csd)
117 csd->flags &= ~CSD_FLAG_LOCK; 117 csd->flags &= ~CSD_FLAG_LOCK;
118} 118}
119 119
120static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
121
120/* 122/*
121 * Insert a previously allocated call_single_data element 123 * Insert a previously allocated call_single_data element
122 * for execution on the given CPU. data must already have 124 * for execution on the given CPU. data must already have
123 * ->func, ->info, and ->flags set. 125 * ->func, ->info, and ->flags set.
124 */ 126 */
125static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) 127static int generic_exec_single(int cpu, struct call_single_data *csd,
128 smp_call_func_t func, void *info, int wait)
126{ 129{
130 struct call_single_data csd_stack = { .flags = 0 };
131 unsigned long flags;
132
133
134 if (cpu == smp_processor_id()) {
135 local_irq_save(flags);
136 func(info);
137 local_irq_restore(flags);
138 return 0;
139 }
140
141
142 if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu))
143 return -ENXIO;
144
145
146 if (!csd) {
147 csd = &csd_stack;
148 if (!wait)
149 csd = &__get_cpu_var(csd_data);
150 }
151
152 csd_lock(csd);
153
154 csd->func = func;
155 csd->info = info;
156
127 if (wait) 157 if (wait)
128 csd->flags |= CSD_FLAG_WAIT; 158 csd->flags |= CSD_FLAG_WAIT;
129 159
@@ -143,6 +173,8 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
143 173
144 if (wait) 174 if (wait)
145 csd_lock_wait(csd); 175 csd_lock_wait(csd);
176
177 return 0;
146} 178}
147 179
148/* 180/*
@@ -151,7 +183,8 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
151 */ 183 */
152void generic_smp_call_function_single_interrupt(void) 184void generic_smp_call_function_single_interrupt(void)
153{ 185{
154 struct llist_node *entry, *next; 186 struct llist_node *entry;
187 struct call_single_data *csd, *csd_next;
155 188
156 /* 189 /*
157 * Shouldn't receive this interrupt on a cpu that is not yet online. 190 * Shouldn't receive this interrupt on a cpu that is not yet online.
@@ -161,21 +194,12 @@ void generic_smp_call_function_single_interrupt(void)
161 entry = llist_del_all(&__get_cpu_var(call_single_queue)); 194 entry = llist_del_all(&__get_cpu_var(call_single_queue));
162 entry = llist_reverse_order(entry); 195 entry = llist_reverse_order(entry);
163 196
164 while (entry) { 197 llist_for_each_entry_safe(csd, csd_next, entry, llist) {
165 struct call_single_data *csd;
166
167 next = entry->next;
168
169 csd = llist_entry(entry, struct call_single_data, llist);
170 csd->func(csd->info); 198 csd->func(csd->info);
171 csd_unlock(csd); 199 csd_unlock(csd);
172
173 entry = next;
174 } 200 }
175} 201}
176 202
177static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
178
179/* 203/*
180 * smp_call_function_single - Run a function on a specific CPU 204 * smp_call_function_single - Run a function on a specific CPU
181 * @func: The function to run. This must be fast and non-blocking. 205 * @func: The function to run. This must be fast and non-blocking.
@@ -187,12 +211,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
187int smp_call_function_single(int cpu, smp_call_func_t func, void *info, 211int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
188 int wait) 212 int wait)
189{ 213{
190 struct call_single_data d = {
191 .flags = 0,
192 };
193 unsigned long flags;
194 int this_cpu; 214 int this_cpu;
195 int err = 0; 215 int err;
196 216
197 /* 217 /*
198 * prevent preemption and reschedule on another processor, 218 * prevent preemption and reschedule on another processor,
@@ -209,32 +229,41 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
209 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 229 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
210 && !oops_in_progress); 230 && !oops_in_progress);
211 231
212 if (cpu == this_cpu) { 232 err = generic_exec_single(cpu, NULL, func, info, wait);
213 local_irq_save(flags);
214 func(info);
215 local_irq_restore(flags);
216 } else {
217 if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
218 struct call_single_data *csd = &d;
219 233
220 if (!wait) 234 put_cpu();
221 csd = &__get_cpu_var(csd_data);
222 235
223 csd_lock(csd); 236 return err;
237}
238EXPORT_SYMBOL(smp_call_function_single);
224 239
225 csd->func = func; 240/**
226 csd->info = info; 241 * smp_call_function_single_async(): Run an asynchronous function on a
227 generic_exec_single(cpu, csd, wait); 242 * specific CPU.
228 } else { 243 * @cpu: The CPU to run on.
229 err = -ENXIO; /* CPU not online */ 244 * @csd: Pre-allocated and setup data structure
230 } 245 *
 231 } 246 * Like smp_call_function_single(), but the call is asynchronous and
247 * can thus be done from contexts with disabled interrupts.
248 *
249 * The caller passes his own pre-allocated data structure
250 * (ie: embedded in an object) and is responsible for synchronizing it
251 * such that the IPIs performed on the @csd are strictly serialized.
252 *
253 * NOTE: Be careful, there is unfortunately no current debugging facility to
254 * validate the correctness of this serialization.
255 */
256int smp_call_function_single_async(int cpu, struct call_single_data *csd)
257{
258 int err = 0;
232 259
233 put_cpu(); 260 preempt_disable();
261 err = generic_exec_single(cpu, csd, csd->func, csd->info, 0);
262 preempt_enable();
234 263
235 return err; 264 return err;
236} 265}
237EXPORT_SYMBOL(smp_call_function_single); 266EXPORT_SYMBOL_GPL(smp_call_function_single_async);
238 267
239/* 268/*
240 * smp_call_function_any - Run a function on any of the given cpus 269 * smp_call_function_any - Run a function on any of the given cpus
@@ -280,44 +309,6 @@ call:
280EXPORT_SYMBOL_GPL(smp_call_function_any); 309EXPORT_SYMBOL_GPL(smp_call_function_any);
281 310
282/** 311/**
283 * __smp_call_function_single(): Run a function on a specific CPU
284 * @cpu: The CPU to run on.
285 * @data: Pre-allocated and setup data structure
286 * @wait: If true, wait until function has completed on specified CPU.
287 *
288 * Like smp_call_function_single(), but allow caller to pass in a
289 * pre-allocated data structure. Useful for embedding @data inside
290 * other structures, for instance.
291 */
292void __smp_call_function_single(int cpu, struct call_single_data *csd,
293 int wait)
294{
295 unsigned int this_cpu;
296 unsigned long flags;
297
298 this_cpu = get_cpu();
299 /*
300 * Can deadlock when called with interrupts disabled.
301 * We allow cpu's that are not yet online though, as no one else can
302 * send smp call function interrupt to this cpu and as such deadlocks
303 * can't happen.
304 */
305 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
306 && !oops_in_progress);
307
308 if (cpu == this_cpu) {
309 local_irq_save(flags);
310 csd->func(csd->info);
311 local_irq_restore(flags);
312 } else {
313 csd_lock(csd);
314 generic_exec_single(cpu, csd, wait);
315 }
316 put_cpu();
317}
318EXPORT_SYMBOL_GPL(__smp_call_function_single);
319
320/**
321 * smp_call_function_many(): Run a function on a set of other CPUs. 312 * smp_call_function_many(): Run a function on a set of other CPUs.
322 * @mask: The set of cpus to run on (only runs on online subset). 313 * @mask: The set of cpus to run on (only runs on online subset).
323 * @func: The function to run. This must be fast and non-blocking. 314 * @func: The function to run. This must be fast and non-blocking.
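
The key property of smp_call_function_single_async() is that the caller owns the call_single_data and must serialize its reuse itself. A minimal sketch of the embedded-csd pattern the kerneldoc above describes (my_dev, my_remote_func and my_kick are illustrative names):

#include <linux/smp.h>

struct my_dev {
	struct call_single_data csd;	/* caller-owned, reused across IPIs */
	int last_cpu;
};

static void my_remote_func(void *info)
{
	/* Runs from the IPI handler on the target CPU. */
	struct my_dev *dev = info;

	dev->last_cpu = smp_processor_id();
}

static void my_kick(struct my_dev *dev, int cpu)
{
	/*
	 * The caller must guarantee the previous IPI on this csd has
	 * completed before reusing it; nothing checks this for us.
	 */
	dev->csd.func = my_remote_func;
	dev->csd.info = dev;
	smp_call_function_single_async(cpu, &dev->csd);
}
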
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 490fcbb1dc5b..b50990a5bea0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -25,6 +25,7 @@
25#include <linux/smp.h> 25#include <linux/smp.h>
26#include <linux/smpboot.h> 26#include <linux/smpboot.h>
27#include <linux/tick.h> 27#include <linux/tick.h>
28#include <linux/irq.h>
28 29
29#define CREATE_TRACE_POINTS 30#define CREATE_TRACE_POINTS
30#include <trace/events/irq.h> 31#include <trace/events/irq.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index c0a58be780a4..adaeab6f7a87 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -174,10 +174,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
174 174
175 /* normalize: avoid signed division (rounding problems) */ 175 /* normalize: avoid signed division (rounding problems) */
176 error = -ESRCH; 176 error = -ESRCH;
177 if (niceval < -20) 177 if (niceval < MIN_NICE)
178 niceval = -20; 178 niceval = MIN_NICE;
179 if (niceval > 19) 179 if (niceval > MAX_NICE)
180 niceval = 19; 180 niceval = MAX_NICE;
181 181
182 rcu_read_lock(); 182 rcu_read_lock();
183 read_lock(&tasklist_lock); 183 read_lock(&tasklist_lock);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7078052284fd..bc8d1b74a6b9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -146,11 +146,13 @@ cond_syscall(sys_io_destroy);
146cond_syscall(sys_io_submit); 146cond_syscall(sys_io_submit);
147cond_syscall(sys_io_cancel); 147cond_syscall(sys_io_cancel);
148cond_syscall(sys_io_getevents); 148cond_syscall(sys_io_getevents);
149cond_syscall(sys_sysfs);
149cond_syscall(sys_syslog); 150cond_syscall(sys_syslog);
150cond_syscall(sys_process_vm_readv); 151cond_syscall(sys_process_vm_readv);
151cond_syscall(sys_process_vm_writev); 152cond_syscall(sys_process_vm_writev);
152cond_syscall(compat_sys_process_vm_readv); 153cond_syscall(compat_sys_process_vm_readv);
153cond_syscall(compat_sys_process_vm_writev); 154cond_syscall(compat_sys_process_vm_writev);
155cond_syscall(sys_uselib);
154 156
155/* arch-specific weak syscall entries */ 157/* arch-specific weak syscall entries */
156cond_syscall(sys_pciconfig_read); 158cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 49e13e1f8fe6..5c14b547882e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -112,9 +112,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
112#ifndef CONFIG_MMU 112#ifndef CONFIG_MMU
113extern int sysctl_nr_trim_pages; 113extern int sysctl_nr_trim_pages;
114#endif 114#endif
115#ifdef CONFIG_BLOCK
116extern int blk_iopoll_enabled;
117#endif
118 115
119/* Constants used for minimum and maximum */ 116/* Constants used for minimum and maximum */
120#ifdef CONFIG_LOCKUP_DETECTOR 117#ifdef CONFIG_LOCKUP_DETECTOR
@@ -126,7 +123,7 @@ static int __maybe_unused neg_one = -1;
126static int zero; 123static int zero;
127static int __maybe_unused one = 1; 124static int __maybe_unused one = 1;
128static int __maybe_unused two = 2; 125static int __maybe_unused two = 2;
129static int __maybe_unused three = 3; 126static int __maybe_unused four = 4;
130static unsigned long one_ul = 1; 127static unsigned long one_ul = 1;
131static int one_hundred = 100; 128static int one_hundred = 100;
132#ifdef CONFIG_PRINTK 129#ifdef CONFIG_PRINTK
@@ -386,13 +383,6 @@ static struct ctl_table kern_table[] = {
386 .proc_handler = proc_dointvec, 383 .proc_handler = proc_dointvec,
387 }, 384 },
388 { 385 {
389 .procname = "numa_balancing_migrate_deferred",
390 .data = &sysctl_numa_balancing_migrate_deferred,
391 .maxlen = sizeof(unsigned int),
392 .mode = 0644,
393 .proc_handler = proc_dointvec,
394 },
395 {
396 .procname = "numa_balancing", 386 .procname = "numa_balancing",
397 .data = NULL, /* filled in by handler */ 387 .data = NULL, /* filled in by handler */
398 .maxlen = sizeof(unsigned int), 388 .maxlen = sizeof(unsigned int),
@@ -1094,15 +1084,6 @@ static struct ctl_table kern_table[] = {
1094 .proc_handler = proc_dointvec, 1084 .proc_handler = proc_dointvec,
1095 }, 1085 },
1096#endif 1086#endif
1097#ifdef CONFIG_BLOCK
1098 {
1099 .procname = "blk_iopoll",
1100 .data = &blk_iopoll_enabled,
1101 .maxlen = sizeof(int),
1102 .mode = 0644,
1103 .proc_handler = proc_dointvec,
1104 },
1105#endif
1106 { } 1087 { }
1107}; 1088};
1108 1089
@@ -1283,7 +1264,7 @@ static struct ctl_table vm_table[] = {
1283 .mode = 0644, 1264 .mode = 0644,
1284 .proc_handler = drop_caches_sysctl_handler, 1265 .proc_handler = drop_caches_sysctl_handler,
1285 .extra1 = &one, 1266 .extra1 = &one,
1286 .extra2 = &three, 1267 .extra2 = &four,
1287 }, 1268 },
1288#ifdef CONFIG_COMPACTION 1269#ifdef CONFIG_COMPACTION
1289 { 1270 {
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 3ce6e8c5f3fc..f448513a45ed 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -124,7 +124,7 @@ config NO_HZ_FULL
124endchoice 124endchoice
125 125
126config NO_HZ_FULL_ALL 126config NO_HZ_FULL_ALL
127 bool "Full dynticks system on all CPUs by default" 127 bool "Full dynticks system on all CPUs by default (except CPU 0)"
128 depends on NO_HZ_FULL 128 depends on NO_HZ_FULL
129 help 129 help
130 If the user doesn't pass the nohz_full boot option to 130 If the user doesn't pass the nohz_full boot option to
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 9250130646f5..57a413fd0ebf 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -3,7 +3,10 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o
3 3
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
6obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o 6ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
7 obj-y += tick-broadcast.o
8 obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o
9endif
7obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o 10obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
8obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o 11obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
9obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o 12obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 086ad6043bcb..ad362c260ef4 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -439,6 +439,19 @@ void clockevents_config_and_register(struct clock_event_device *dev,
439} 439}
440EXPORT_SYMBOL_GPL(clockevents_config_and_register); 440EXPORT_SYMBOL_GPL(clockevents_config_and_register);
441 441
442int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
443{
444 clockevents_config(dev, freq);
445
446 if (dev->mode == CLOCK_EVT_MODE_ONESHOT)
447 return clockevents_program_event(dev, dev->next_event, false);
448
449 if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
450 dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev);
451
452 return 0;
453}
454
442/** 455/**
443 * clockevents_update_freq - Update frequency and reprogram a clock event device. 456 * clockevents_update_freq - Update frequency and reprogram a clock event device.
444 * @dev: device to modify 457 * @dev: device to modify
@@ -446,17 +459,22 @@ EXPORT_SYMBOL_GPL(clockevents_config_and_register);
446 * 459 *
447 * Reconfigure and reprogram a clock event device in oneshot 460 * Reconfigure and reprogram a clock event device in oneshot
448 * mode. Must be called on the cpu for which the device delivers per 461 * mode. Must be called on the cpu for which the device delivers per
449 * cpu timer events with interrupts disabled! Returns 0 on success, 462 * cpu timer events. If called for the broadcast device the core takes
450 * -ETIME when the event is in the past. 463 * care of serialization.
464 *
465 * Returns 0 on success, -ETIME when the event is in the past.
451 */ 466 */
452int clockevents_update_freq(struct clock_event_device *dev, u32 freq) 467int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
453{ 468{
454 clockevents_config(dev, freq); 469 unsigned long flags;
455 470 int ret;
456 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
457 return 0;
458 471
459 return clockevents_program_event(dev, dev->next_event, false); 472 local_irq_save(flags);
473 ret = tick_broadcast_update_freq(dev, freq);
474 if (ret == -ENODEV)
475 ret = __clockevents_update_freq(dev, freq);
476 local_irq_restore(flags);
477 return ret;
460} 478}
461 479
462/* 480/*
@@ -524,12 +542,13 @@ void clockevents_resume(void)
524#ifdef CONFIG_GENERIC_CLOCKEVENTS 542#ifdef CONFIG_GENERIC_CLOCKEVENTS
525/** 543/**
526 * clockevents_notify - notification about relevant events 544 * clockevents_notify - notification about relevant events
545 * Returns 0 on success, any other value on error
527 */ 546 */
528void clockevents_notify(unsigned long reason, void *arg) 547int clockevents_notify(unsigned long reason, void *arg)
529{ 548{
530 struct clock_event_device *dev, *tmp; 549 struct clock_event_device *dev, *tmp;
531 unsigned long flags; 550 unsigned long flags;
532 int cpu; 551 int cpu, ret = 0;
533 552
534 raw_spin_lock_irqsave(&clockevents_lock, flags); 553 raw_spin_lock_irqsave(&clockevents_lock, flags);
535 554
@@ -542,7 +561,7 @@ void clockevents_notify(unsigned long reason, void *arg)
542 561
543 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: 562 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
544 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: 563 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
545 tick_broadcast_oneshot_control(reason); 564 ret = tick_broadcast_oneshot_control(reason);
546 break; 565 break;
547 566
548 case CLOCK_EVT_NOTIFY_CPU_DYING: 567 case CLOCK_EVT_NOTIFY_CPU_DYING:
@@ -585,6 +604,7 @@ void clockevents_notify(unsigned long reason, void *arg)
585 break; 604 break;
586 } 605 }
587 raw_spin_unlock_irqrestore(&clockevents_lock, flags); 606 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
607 return ret;
588} 608}
589EXPORT_SYMBOL_GPL(clockevents_notify); 609EXPORT_SYMBOL_GPL(clockevents_notify);
590 610
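
With the broadcast case folded into clockevents_update_freq() and the interrupt disabling done internally, a clock event driver whose input clock changes (for instance from a clk or cpufreq notifier) only needs a single call. An illustrative sketch; my_timer_rate_changed is not an existing kernel function:

#include <linux/clockchips.h>

static void my_timer_rate_changed(struct clock_event_device *ce,
				  unsigned long new_rate)
{
	/*
	 * Handles oneshot, periodic and broadcast devices alike and
	 * returns -ETIME if the reprogrammed event is already in the past.
	 */
	if (clockevents_update_freq(ce, new_rate))
		pr_warn("my_timer: reprogram after rate change failed\n");
}
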
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index af8d1d4f3d55..419a52cecd20 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -514,12 +514,13 @@ static void sync_cmos_clock(struct work_struct *work)
514 next.tv_sec++; 514 next.tv_sec++;
515 next.tv_nsec -= NSEC_PER_SEC; 515 next.tv_nsec -= NSEC_PER_SEC;
516 } 516 }
517 schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); 517 queue_delayed_work(system_power_efficient_wq,
518 &sync_cmos_work, timespec_to_jiffies(&next));
518} 519}
519 520
520void ntp_notify_cmos_timer(void) 521void ntp_notify_cmos_timer(void)
521{ 522{
522 schedule_delayed_work(&sync_cmos_work, 0); 523 queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
523} 524}
524 525
525#else 526#else
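
Moving the CMOS sync onto system_power_efficient_wq is a pattern worth copying for deferred work without strict CPU affinity: with CONFIG_WQ_POWER_EFFICIENT in effect the work is queued unbound, so it can run on an already-busy CPU instead of waking an idle one. A minimal sketch with an illustrative work item:

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void my_housekeeping_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_housekeeping_work, my_housekeeping_fn);

static void my_housekeeping_fn(struct work_struct *work)
{
	/* ... periodic, CPU-agnostic work goes here ... */

	/* Re-arm roughly once per second on the power efficient wq. */
	queue_delayed_work(system_power_efficient_wq, &my_housekeeping_work, HZ);
}
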
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
new file mode 100644
index 000000000000..eb682d5c697c
--- /dev/null
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -0,0 +1,106 @@
1/*
2 * linux/kernel/time/tick-broadcast-hrtimer.c
3 * This file emulates a local clock event device
4 * via a pseudo clock device.
5 */
6#include <linux/cpu.h>
7#include <linux/err.h>
8#include <linux/hrtimer.h>
9#include <linux/interrupt.h>
10#include <linux/percpu.h>
11#include <linux/profile.h>
12#include <linux/clockchips.h>
13#include <linux/sched.h>
14#include <linux/smp.h>
15#include <linux/module.h>
16
17#include "tick-internal.h"
18
19static struct hrtimer bctimer;
20
21static void bc_set_mode(enum clock_event_mode mode,
22 struct clock_event_device *bc)
23{
24 switch (mode) {
25 case CLOCK_EVT_MODE_SHUTDOWN:
26 /*
27 * Note, we cannot cancel the timer here as we might
28 * run into the following live lock scenario:
29 *
30 * cpu 0 cpu1
31 * lock(broadcast_lock);
32 * hrtimer_interrupt()
33 * bc_handler()
34 * tick_handle_oneshot_broadcast();
35 * lock(broadcast_lock);
36 * hrtimer_cancel()
37 * wait_for_callback()
38 */
39 hrtimer_try_to_cancel(&bctimer);
40 break;
41 default:
42 break;
43 }
44}
45
46/*
47 * This is called from the guts of the broadcast code when the cpu
48 * which is about to enter idle has the earliest broadcast timer event.
49 */
50static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
51{
52 /*
53 * We try to cancel the timer first. If the callback is on
54 * flight on some other cpu then we let it handle it. If we
55 * were able to cancel the timer nothing can rearm it as we
56 * own broadcast_lock.
57 *
58 * However we can also be called from the event handler of
59 * ce_broadcast_hrtimer itself when it expires. We cannot
60 * restart the timer because we are in the callback, but we
61 * can set the expiry time and let the callback return
62 * HRTIMER_RESTART.
63 */
64 if (hrtimer_try_to_cancel(&bctimer) >= 0) {
65 hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED);
66 /* Bind the "device" to the cpu */
67 bc->bound_on = smp_processor_id();
68 } else if (bc->bound_on == smp_processor_id()) {
69 hrtimer_set_expires(&bctimer, expires);
70 }
71 return 0;
72}
73
74static struct clock_event_device ce_broadcast_hrtimer = {
75 .set_mode = bc_set_mode,
76 .set_next_ktime = bc_set_next,
77 .features = CLOCK_EVT_FEAT_ONESHOT |
78 CLOCK_EVT_FEAT_KTIME |
79 CLOCK_EVT_FEAT_HRTIMER,
80 .rating = 0,
81 .bound_on = -1,
82 .min_delta_ns = 1,
83 .max_delta_ns = KTIME_MAX,
84 .min_delta_ticks = 1,
85 .max_delta_ticks = ULONG_MAX,
86 .mult = 1,
87 .shift = 0,
88 .cpumask = cpu_all_mask,
89};
90
91static enum hrtimer_restart bc_handler(struct hrtimer *t)
92{
93 ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
94
95 if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX)
96 return HRTIMER_NORESTART;
97
98 return HRTIMER_RESTART;
99}
100
101void tick_setup_hrtimer_broadcast(void)
102{
103 hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
104 bctimer.function = bc_handler;
105 clockevents_register_device(&ce_broadcast_hrtimer);
106}
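
The pseudo device only matters once something registers it. A platform without an always-on broadcast timer could do so from its time initialization, along these lines (my_platform_* names are illustrative, and the call assumes the tick_setup_hrtimer_broadcast() prototype is visible to the caller):

void __init my_platform_time_init(void)
{
	my_platform_clockevents_init();

	/*
	 * No real broadcast hardware: register the hrtimer based
	 * pseudo device so deep idle states still get wakeups.
	 */
	tick_setup_hrtimer_broadcast();
}
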
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 98977a57ac72..64c5990fd500 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -120,6 +120,19 @@ int tick_is_broadcast_device(struct clock_event_device *dev)
120 return (dev && tick_broadcast_device.evtdev == dev); 120 return (dev && tick_broadcast_device.evtdev == dev);
121} 121}
122 122
123int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq)
124{
125 int ret = -ENODEV;
126
127 if (tick_is_broadcast_device(dev)) {
128 raw_spin_lock(&tick_broadcast_lock);
129 ret = __clockevents_update_freq(dev, freq);
130 raw_spin_unlock(&tick_broadcast_lock);
131 }
132 return ret;
133}
134
135
123static void err_broadcast(const struct cpumask *mask) 136static void err_broadcast(const struct cpumask *mask)
124{ 137{
125 pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); 138 pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
@@ -272,12 +285,8 @@ static void tick_do_broadcast(struct cpumask *mask)
272 */ 285 */
273static void tick_do_periodic_broadcast(void) 286static void tick_do_periodic_broadcast(void)
274{ 287{
275 raw_spin_lock(&tick_broadcast_lock);
276
277 cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); 288 cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
278 tick_do_broadcast(tmpmask); 289 tick_do_broadcast(tmpmask);
279
280 raw_spin_unlock(&tick_broadcast_lock);
281} 290}
282 291
283/* 292/*
@@ -287,13 +296,15 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
287{ 296{
288 ktime_t next; 297 ktime_t next;
289 298
299 raw_spin_lock(&tick_broadcast_lock);
300
290 tick_do_periodic_broadcast(); 301 tick_do_periodic_broadcast();
291 302
292 /* 303 /*
293 * The device is in periodic mode. No reprogramming necessary: 304 * The device is in periodic mode. No reprogramming necessary:
294 */ 305 */
295 if (dev->mode == CLOCK_EVT_MODE_PERIODIC) 306 if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
296 return; 307 goto unlock;
297 308
298 /* 309 /*
299 * Setup the next period for devices, which do not have 310 * Setup the next period for devices, which do not have
@@ -306,9 +317,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
306 next = ktime_add(next, tick_period); 317 next = ktime_add(next, tick_period);
307 318
308 if (!clockevents_program_event(dev, next, false)) 319 if (!clockevents_program_event(dev, next, false))
309 return; 320 goto unlock;
310 tick_do_periodic_broadcast(); 321 tick_do_periodic_broadcast();
311 } 322 }
323unlock:
324 raw_spin_unlock(&tick_broadcast_lock);
312} 325}
313 326
314/* 327/*
@@ -630,24 +643,61 @@ again:
630 raw_spin_unlock(&tick_broadcast_lock); 643 raw_spin_unlock(&tick_broadcast_lock);
631} 644}
632 645
646static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
647{
648 if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER))
649 return 0;
650 if (bc->next_event.tv64 == KTIME_MAX)
651 return 0;
652 return bc->bound_on == cpu ? -EBUSY : 0;
653}
654
655static void broadcast_shutdown_local(struct clock_event_device *bc,
656 struct clock_event_device *dev)
657{
658 /*
659 * For hrtimer based broadcasting we cannot shutdown the cpu
660 * local device if our own event is the first one to expire or
661 * if we own the broadcast timer.
662 */
663 if (bc->features & CLOCK_EVT_FEAT_HRTIMER) {
664 if (broadcast_needs_cpu(bc, smp_processor_id()))
665 return;
666 if (dev->next_event.tv64 < bc->next_event.tv64)
667 return;
668 }
669 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
670}
671
672static void broadcast_move_bc(int deadcpu)
673{
674 struct clock_event_device *bc = tick_broadcast_device.evtdev;
675
676 if (!bc || !broadcast_needs_cpu(bc, deadcpu))
677 return;
678 /* This moves the broadcast assignment to this cpu */
679 clockevents_program_event(bc, bc->next_event, 1);
680}
681
633/* 682/*
634 * Powerstate information: The system enters/leaves a state, where 683 * Powerstate information: The system enters/leaves a state, where
635 * affected devices might stop 684 * affected devices might stop
685 * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
636 */ 686 */
637void tick_broadcast_oneshot_control(unsigned long reason) 687int tick_broadcast_oneshot_control(unsigned long reason)
638{ 688{
639 struct clock_event_device *bc, *dev; 689 struct clock_event_device *bc, *dev;
640 struct tick_device *td; 690 struct tick_device *td;
641 unsigned long flags; 691 unsigned long flags;
642 ktime_t now; 692 ktime_t now;
643 int cpu; 693 int cpu, ret = 0;
644 694
645 /* 695 /*
646 * Periodic mode does not care about the enter/exit of power 696 * Periodic mode does not care about the enter/exit of power
647 * states 697 * states
648 */ 698 */
649 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 699 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
650 return; 700 return 0;
651 701
652 /* 702 /*
 653 * We are called with preemption disabled from the depth of the 703
@@ -658,7 +708,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
658 dev = td->evtdev; 708 dev = td->evtdev;
659 709
660 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) 710 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
661 return; 711 return 0;
662 712
663 bc = tick_broadcast_device.evtdev; 713 bc = tick_broadcast_device.evtdev;
664 714
@@ -666,7 +716,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
666 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 716 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
667 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { 717 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
668 WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); 718 WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
669 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 719 broadcast_shutdown_local(bc, dev);
670 /* 720 /*
671 * We only reprogram the broadcast timer if we 721 * We only reprogram the broadcast timer if we
672 * did not mark ourself in the force mask and 722 * did not mark ourself in the force mask and
@@ -679,6 +729,16 @@ void tick_broadcast_oneshot_control(unsigned long reason)
679 dev->next_event.tv64 < bc->next_event.tv64) 729 dev->next_event.tv64 < bc->next_event.tv64)
680 tick_broadcast_set_event(bc, cpu, dev->next_event, 1); 730 tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
681 } 731 }
732 /*
733 * If the current CPU owns the hrtimer broadcast
734 * mechanism, it cannot go deep idle and we remove the
735 * CPU from the broadcast mask. We don't have to go
736 * through the EXIT path as the local timer is not
737 * shutdown.
738 */
739 ret = broadcast_needs_cpu(bc, cpu);
740 if (ret)
741 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
682 } else { 742 } else {
683 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { 743 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
684 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 744 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
@@ -746,6 +806,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
746 } 806 }
747out: 807out:
748 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 808 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
809 return ret;
749} 810}
750 811
751/* 812/*
@@ -852,6 +913,8 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
852 cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); 913 cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
853 cpumask_clear_cpu(cpu, tick_broadcast_force_mask); 914 cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
854 915
916 broadcast_move_bc(cpu);
917
855 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 918 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
856} 919}
857 920
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 20b2fe37d105..015661279b68 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -98,18 +98,19 @@ static void tick_periodic(int cpu)
98void tick_handle_periodic(struct clock_event_device *dev) 98void tick_handle_periodic(struct clock_event_device *dev)
99{ 99{
100 int cpu = smp_processor_id(); 100 int cpu = smp_processor_id();
101 ktime_t next; 101 ktime_t next = dev->next_event;
102 102
103 tick_periodic(cpu); 103 tick_periodic(cpu);
104 104
105 if (dev->mode != CLOCK_EVT_MODE_ONESHOT) 105 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
106 return; 106 return;
107 /*
108 * Setup the next period for devices, which do not have
109 * periodic mode:
110 */
111 next = ktime_add(dev->next_event, tick_period);
112 for (;;) { 107 for (;;) {
108 /*
109 * Setup the next period for devices, which do not have
110 * periodic mode:
111 */
112 next = ktime_add(next, tick_period);
113
113 if (!clockevents_program_event(dev, next, false)) 114 if (!clockevents_program_event(dev, next, false))
114 return; 115 return;
115 /* 116 /*
@@ -118,12 +119,11 @@ void tick_handle_periodic(struct clock_event_device *dev)
118 * to be sure we're using a real hardware clocksource. 119 * to be sure we're using a real hardware clocksource.
119 * Otherwise we could get trapped in an infinite 120 * Otherwise we could get trapped in an infinite
120 * loop, as the tick_periodic() increments jiffies, 121 * loop, as the tick_periodic() increments jiffies,
121 * when then will increment time, posibly causing 122 * which then will increment time, possibly causing
122 * the loop to trigger again and again. 123 * the loop to trigger again and again.
123 */ 124 */
124 if (timekeeping_valid_for_hres()) 125 if (timekeeping_valid_for_hres())
125 tick_periodic(cpu); 126 tick_periodic(cpu);
126 next = ktime_add(next, tick_period);
127 } 127 }
128} 128}
129 129
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 8329669b51ec..7ab92b19965a 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -46,7 +46,7 @@ extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
46extern void tick_resume_oneshot(void); 46extern void tick_resume_oneshot(void);
47# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 47# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
48extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); 48extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
49extern void tick_broadcast_oneshot_control(unsigned long reason); 49extern int tick_broadcast_oneshot_control(unsigned long reason);
50extern void tick_broadcast_switch_to_oneshot(void); 50extern void tick_broadcast_switch_to_oneshot(void);
51extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 51extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
52extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 52extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
@@ -58,7 +58,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
58{ 58{
59 BUG(); 59 BUG();
60} 60}
61static inline void tick_broadcast_oneshot_control(unsigned long reason) { } 61static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
62static inline void tick_broadcast_switch_to_oneshot(void) { } 62static inline void tick_broadcast_switch_to_oneshot(void) { }
63static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 63static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
64static inline int tick_broadcast_oneshot_active(void) { return 0; } 64static inline int tick_broadcast_oneshot_active(void) { return 0; }
@@ -87,7 +87,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
87{ 87{
88 BUG(); 88 BUG();
89} 89}
90static inline void tick_broadcast_oneshot_control(unsigned long reason) { } 90static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; }
91static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 91static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
92static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 92static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
93{ 93{
@@ -111,6 +111,7 @@ extern int tick_resume_broadcast(void);
111extern void tick_broadcast_init(void); 111extern void tick_broadcast_init(void);
112extern void 112extern void
113tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); 113tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
114int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
114 115
115#else /* !BROADCAST */ 116#else /* !BROADCAST */
116 117
@@ -133,6 +134,8 @@ static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
133static inline void tick_suspend_broadcast(void) { } 134static inline void tick_suspend_broadcast(void) { }
134static inline int tick_resume_broadcast(void) { return 0; } 135static inline int tick_resume_broadcast(void) { return 0; }
135static inline void tick_broadcast_init(void) { } 136static inline void tick_broadcast_init(void) { }
137static inline int tick_broadcast_update_freq(struct clock_event_device *dev,
138 u32 freq) { return -ENODEV; }
136 139
137/* 140/*
138 * Set the periodic handler in non broadcast mode 141 * Set the periodic handler in non broadcast mode
@@ -152,6 +155,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
152 return !(dev->features & CLOCK_EVT_FEAT_DUMMY); 155 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
153} 156}
154 157
158int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
159
155#endif 160#endif
156 161
157extern void do_timer(unsigned long ticks); 162extern void do_timer(unsigned long ticks);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 0aa4ce81bc16..5b40279ecd71 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1435,7 +1435,8 @@ void update_wall_time(void)
1435out: 1435out:
1436 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1436 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1437 if (clock_set) 1437 if (clock_set)
1438 clock_was_set(); 1438 /* Have to call _delayed version, since in irq context*/
1439 clock_was_set_delayed();
1439} 1440}
1440 1441
1441/** 1442/**
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 802433a4f5eb..4d54f97558df 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -21,6 +21,8 @@
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/time.h> 22#include <linux/time.h>
23 23
24#include "timekeeping_internal.h"
25
24static unsigned int sleep_time_bin[32] = {0}; 26static unsigned int sleep_time_bin[32] = {0};
25 27
26static int tk_debug_show_sleep_time(struct seq_file *s, void *data) 28static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
diff --git a/kernel/timer.c b/kernel/timer.c
index accfd241b9e5..87bd529879c2 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -52,7 +52,7 @@
52#define CREATE_TRACE_POINTS 52#define CREATE_TRACE_POINTS
53#include <trace/events/timer.h> 53#include <trace/events/timer.h>
54 54
55u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; 55__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
56 56
57EXPORT_SYMBOL(jiffies_64); 57EXPORT_SYMBOL(jiffies_64);
58 58
@@ -81,6 +81,7 @@ struct tvec_base {
81 unsigned long timer_jiffies; 81 unsigned long timer_jiffies;
82 unsigned long next_timer; 82 unsigned long next_timer;
83 unsigned long active_timers; 83 unsigned long active_timers;
84 unsigned long all_timers;
84 struct tvec_root tv1; 85 struct tvec_root tv1;
85 struct tvec tv2; 86 struct tvec tv2;
86 struct tvec tv3; 87 struct tvec tv3;
@@ -337,6 +338,20 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
337} 338}
338EXPORT_SYMBOL_GPL(set_timer_slack); 339EXPORT_SYMBOL_GPL(set_timer_slack);
339 340
341/*
342 * If the list is empty, catch up ->timer_jiffies to the current time.
343 * The caller must hold the tvec_base lock. Returns true if the list
344 * was empty and therefore ->timer_jiffies was updated.
345 */
346static bool catchup_timer_jiffies(struct tvec_base *base)
347{
348 if (!base->all_timers) {
349 base->timer_jiffies = jiffies;
350 return true;
351 }
352 return false;
353}
354
340static void 355static void
341__internal_add_timer(struct tvec_base *base, struct timer_list *timer) 356__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
342{ 357{
@@ -383,15 +398,17 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
383 398
384static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) 399static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
385{ 400{
401 (void)catchup_timer_jiffies(base);
386 __internal_add_timer(base, timer); 402 __internal_add_timer(base, timer);
387 /* 403 /*
388 * Update base->active_timers and base->next_timer 404 * Update base->active_timers and base->next_timer
389 */ 405 */
390 if (!tbase_get_deferrable(timer->base)) { 406 if (!tbase_get_deferrable(timer->base)) {
391 if (time_before(timer->expires, base->next_timer)) 407 if (!base->active_timers++ ||
408 time_before(timer->expires, base->next_timer))
392 base->next_timer = timer->expires; 409 base->next_timer = timer->expires;
393 base->active_timers++;
394 } 410 }
411 base->all_timers++;
395} 412}
396 413
397#ifdef CONFIG_TIMER_STATS 414#ifdef CONFIG_TIMER_STATS
@@ -671,6 +688,8 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
671 detach_timer(timer, true); 688 detach_timer(timer, true);
672 if (!tbase_get_deferrable(timer->base)) 689 if (!tbase_get_deferrable(timer->base))
673 base->active_timers--; 690 base->active_timers--;
691 base->all_timers--;
692 (void)catchup_timer_jiffies(base);
674} 693}
675 694
676static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, 695static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
@@ -685,6 +704,8 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
685 if (timer->expires == base->next_timer) 704 if (timer->expires == base->next_timer)
686 base->next_timer = base->timer_jiffies; 705 base->next_timer = base->timer_jiffies;
687 } 706 }
707 base->all_timers--;
708 (void)catchup_timer_jiffies(base);
688 return 1; 709 return 1;
689} 710}
690 711
@@ -739,12 +760,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
739 760
740 debug_activate(timer, expires); 761 debug_activate(timer, expires);
741 762
742 cpu = smp_processor_id(); 763 cpu = get_nohz_timer_target(pinned);
743
744#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
745 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
746 cpu = get_nohz_timer_target();
747#endif
748 new_base = per_cpu(tvec_bases, cpu); 764 new_base = per_cpu(tvec_bases, cpu);
749 765
750 if (base != new_base) { 766 if (base != new_base) {
@@ -939,8 +955,15 @@ void add_timer_on(struct timer_list *timer, int cpu)
939 * with the timer by holding the timer base lock. This also 955 * with the timer by holding the timer base lock. This also
940 * makes sure that a CPU on the way to stop its tick can not 956 * makes sure that a CPU on the way to stop its tick can not
941 * evaluate the timer wheel. 957 * evaluate the timer wheel.
958 *
959 * Spare the IPI for deferrable timers on idle targets though.
960 * The next busy ticks will take care of it. Except full dynticks
 961 * require special care against races with idle_cpu(); let's deal
 962 * with that later.
942 */ 963 */
943 wake_up_nohz_cpu(cpu); 964 if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu))
965 wake_up_nohz_cpu(cpu);
966
944 spin_unlock_irqrestore(&base->lock, flags); 967 spin_unlock_irqrestore(&base->lock, flags);
945} 968}
946EXPORT_SYMBOL_GPL(add_timer_on); 969EXPORT_SYMBOL_GPL(add_timer_on);
@@ -1146,6 +1169,10 @@ static inline void __run_timers(struct tvec_base *base)
1146 struct timer_list *timer; 1169 struct timer_list *timer;
1147 1170
1148 spin_lock_irq(&base->lock); 1171 spin_lock_irq(&base->lock);
1172 if (catchup_timer_jiffies(base)) {
1173 spin_unlock_irq(&base->lock);
1174 return;
1175 }
1149 while (time_after_eq(jiffies, base->timer_jiffies)) { 1176 while (time_after_eq(jiffies, base->timer_jiffies)) {
1150 struct list_head work_list; 1177 struct list_head work_list;
1151 struct list_head *head = &work_list; 1178 struct list_head *head = &work_list;
@@ -1160,7 +1187,7 @@ static inline void __run_timers(struct tvec_base *base)
1160 !cascade(base, &base->tv4, INDEX(2))) 1187 !cascade(base, &base->tv4, INDEX(2)))
1161 cascade(base, &base->tv5, INDEX(3)); 1188 cascade(base, &base->tv5, INDEX(3));
1162 ++base->timer_jiffies; 1189 ++base->timer_jiffies;
1163 list_replace_init(base->tv1.vec + index, &work_list); 1190 list_replace_init(base->tv1.vec + index, head);
1164 while (!list_empty(head)) { 1191 while (!list_empty(head)) {
1165 void (*fn)(unsigned long); 1192 void (*fn)(unsigned long);
1166 unsigned long data; 1193 unsigned long data;
@@ -1523,9 +1550,8 @@ static int init_timers_cpu(int cpu)
1523 if (!base) 1550 if (!base)
1524 return -ENOMEM; 1551 return -ENOMEM;
1525 1552
1526 /* Make sure that tvec_base is 2 byte aligned */ 1553 /* Make sure tvec_base has TIMER_FLAG_MASK bits free */
1527 if (tbase_get_deferrable(base)) { 1554 if (WARN_ON(base != tbase_get_base(base))) {
1528 WARN_ON(1);
1529 kfree(base); 1555 kfree(base);
1530 return -ENOMEM; 1556 return -ENOMEM;
1531 } 1557 }
@@ -1559,6 +1585,7 @@ static int init_timers_cpu(int cpu)
1559 base->timer_jiffies = jiffies; 1585 base->timer_jiffies = jiffies;
1560 base->next_timer = base->timer_jiffies; 1586 base->next_timer = base->timer_jiffies;
1561 base->active_timers = 0; 1587 base->active_timers = 0;
1588 base->all_timers = 0;
1562 return 0; 1589 return 0;
1563} 1590}
1564 1591
@@ -1648,9 +1675,9 @@ void __init init_timers(void)
1648 1675
1649 err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, 1676 err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1650 (void *)(long)smp_processor_id()); 1677 (void *)(long)smp_processor_id());
1651 init_timer_stats();
1652
1653 BUG_ON(err != NOTIFY_OK); 1678 BUG_ON(err != NOTIFY_OK);
1679
1680 init_timer_stats();
1654 register_cpu_notifier(&timers_nb); 1681 register_cpu_notifier(&timers_nb);
1655 open_softirq(TIMER_SOFTIRQ, run_timer_softirq); 1682 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1656} 1683}
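
Taken together, the timer.c hunks above add an all_timers count so that catchup_timer_jiffies() can jump ->timer_jiffies straight to the current jiffies when the wheel is empty, instead of __run_timers() stepping through every missed jiffy. A minimal userspace sketch of the saving (the struct is a simplification, not the real tvec_base):

    #include <stdio.h>
    #include <stdbool.h>

    struct tvec_base_sim {
            unsigned long timer_jiffies;    /* last jiffy the wheel processed */
            unsigned long all_timers;       /* deferrable + non-deferrable */
    };

    static bool catchup_timer_jiffies(struct tvec_base_sim *base, unsigned long jiffies)
    {
            if (!base->all_timers) {
                    base->timer_jiffies = jiffies;  /* jump, nothing to cascade */
                    return true;
            }
            return false;
    }

    int main(void)
    {
            struct tvec_base_sim base = { .timer_jiffies = 1000, .all_timers = 0 };
            unsigned long jiffies = 1000 + 10 * 1000;   /* idle ~10s at HZ=1000 */
            unsigned long iterations = 0;

            if (!catchup_timer_jiffies(&base, jiffies)) {
                    while (jiffies >= base.timer_jiffies) {     /* old behaviour */
                            ++base.timer_jiffies;
                            ++iterations;
                    }
            }
            printf("wheel iterations needed: %lu (timer_jiffies=%lu)\n",
                   iterations, base.timer_jiffies);
            return 0;
    }
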
diff --git a/kernel/torture.c b/kernel/torture.c
new file mode 100644
index 000000000000..acc9afc2f26e
--- /dev/null
+++ b/kernel/torture.c
@@ -0,0 +1,719 @@
1/*
2 * Common functions for in-kernel torture tests.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright (C) IBM Corporation, 2014
19 *
20 * Author: Paul E. McKenney <paulmck@us.ibm.com>
21 * Based on kernel/rcu/torture.c.
22 */
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/init.h>
26#include <linux/module.h>
27#include <linux/kthread.h>
28#include <linux/err.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/interrupt.h>
32#include <linux/sched.h>
33#include <linux/atomic.h>
34#include <linux/bitops.h>
35#include <linux/completion.h>
36#include <linux/moduleparam.h>
37#include <linux/percpu.h>
38#include <linux/notifier.h>
39#include <linux/reboot.h>
40#include <linux/freezer.h>
41#include <linux/cpu.h>
42#include <linux/delay.h>
43#include <linux/stat.h>
44#include <linux/slab.h>
45#include <linux/trace_clock.h>
46#include <asm/byteorder.h>
47#include <linux/torture.h>
48
49MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
51
52static char *torture_type;
53static bool verbose;
54
55/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
56#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
57#define FULLSTOP_SHUTDOWN 1 /* System shutdown with torture running. */
58#define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */
59static int fullstop = FULLSTOP_RMMOD;
60static DEFINE_MUTEX(fullstop_mutex);
61static int *torture_runnable;
62
63#ifdef CONFIG_HOTPLUG_CPU
64
65/*
66 * Variables for online-offline handling. Only present if CPU hotplug
67 * is enabled; otherwise these do nothing.
68 */
69
70static struct task_struct *onoff_task;
71static long onoff_holdoff;
72static long onoff_interval;
73static long n_offline_attempts;
74static long n_offline_successes;
75static unsigned long sum_offline;
76static int min_offline = -1;
77static int max_offline;
78static long n_online_attempts;
79static long n_online_successes;
80static unsigned long sum_online;
81static int min_online = -1;
82static int max_online;
83
84/*
85 * Execute random CPU-hotplug operations at the interval specified
86 * by the onoff_interval.
87 */
88static int
89torture_onoff(void *arg)
90{
91 int cpu;
92 unsigned long delta;
93 int maxcpu = -1;
94 DEFINE_TORTURE_RANDOM(rand);
95 int ret;
96 unsigned long starttime;
97
98 VERBOSE_TOROUT_STRING("torture_onoff task started");
99 for_each_online_cpu(cpu)
100 maxcpu = cpu;
101 WARN_ON(maxcpu < 0);
102 if (onoff_holdoff > 0) {
103 VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");
104 schedule_timeout_interruptible(onoff_holdoff);
105 VERBOSE_TOROUT_STRING("torture_onoff end holdoff");
106 }
107 while (!torture_must_stop()) {
108 cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
109 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
110 if (verbose)
111 pr_alert("%s" TORTURE_FLAG
112 "torture_onoff task: offlining %d\n",
113 torture_type, cpu);
114 starttime = jiffies;
115 n_offline_attempts++;
116 ret = cpu_down(cpu);
117 if (ret) {
118 if (verbose)
119 pr_alert("%s" TORTURE_FLAG
120 "torture_onoff task: offline %d failed: errno %d\n",
121 torture_type, cpu, ret);
122 } else {
123 if (verbose)
124 pr_alert("%s" TORTURE_FLAG
125 "torture_onoff task: offlined %d\n",
126 torture_type, cpu);
127 n_offline_successes++;
128 delta = jiffies - starttime;
129 sum_offline += delta;
130 if (min_offline < 0) {
131 min_offline = delta;
132 max_offline = delta;
133 }
134 if (min_offline > delta)
135 min_offline = delta;
136 if (max_offline < delta)
137 max_offline = delta;
138 }
139 } else if (cpu_is_hotpluggable(cpu)) {
140 if (verbose)
141 pr_alert("%s" TORTURE_FLAG
142 "torture_onoff task: onlining %d\n",
143 torture_type, cpu);
144 starttime = jiffies;
145 n_online_attempts++;
146 ret = cpu_up(cpu);
147 if (ret) {
148 if (verbose)
149 pr_alert("%s" TORTURE_FLAG
150 "torture_onoff task: online %d failed: errno %d\n",
151 torture_type, cpu, ret);
152 } else {
153 if (verbose)
154 pr_alert("%s" TORTURE_FLAG
155 "torture_onoff task: onlined %d\n",
156 torture_type, cpu);
157 n_online_successes++;
158 delta = jiffies - starttime;
159 sum_online += delta;
160 if (min_online < 0) {
161 min_online = delta;
162 max_online = delta;
163 }
164 if (min_online > delta)
165 min_online = delta;
166 if (max_online < delta)
167 max_online = delta;
168 }
169 }
170 schedule_timeout_interruptible(onoff_interval);
171 }
172 torture_kthread_stopping("torture_onoff");
173 return 0;
174}
175
176#endif /* #ifdef CONFIG_HOTPLUG_CPU */
177
178/*
179 * Initiate online-offline handling.
180 */
181int torture_onoff_init(long ooholdoff, long oointerval)
182{
183 int ret = 0;
184
185#ifdef CONFIG_HOTPLUG_CPU
186 onoff_holdoff = ooholdoff;
187 onoff_interval = oointerval;
188 if (onoff_interval <= 0)
189 return 0;
190 ret = torture_create_kthread(torture_onoff, NULL, onoff_task);
191#endif /* #ifdef CONFIG_HOTPLUG_CPU */
192 return ret;
193}
194EXPORT_SYMBOL_GPL(torture_onoff_init);
195
196/*
197 * Clean up after online/offline testing.
198 */
199static void torture_onoff_cleanup(void)
200{
201#ifdef CONFIG_HOTPLUG_CPU
202 if (onoff_task == NULL)
203 return;
204 VERBOSE_TOROUT_STRING("Stopping torture_onoff task");
205 kthread_stop(onoff_task);
206 onoff_task = NULL;
207#endif /* #ifdef CONFIG_HOTPLUG_CPU */
208}
209EXPORT_SYMBOL_GPL(torture_onoff_cleanup);
210
211/*
212 * Print online/offline testing statistics.
213 */
214char *torture_onoff_stats(char *page)
215{
216#ifdef CONFIG_HOTPLUG_CPU
217 page += sprintf(page,
218 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
219 n_online_successes, n_online_attempts,
220 n_offline_successes, n_offline_attempts,
221 min_online, max_online,
222 min_offline, max_offline,
223 sum_online, sum_offline, HZ);
224#endif /* #ifdef CONFIG_HOTPLUG_CPU */
225 return page;
226}
227EXPORT_SYMBOL_GPL(torture_onoff_stats);
228
229/*
230 * Were all the online/offline operations successful?
231 */
232bool torture_onoff_failures(void)
233{
234#ifdef CONFIG_HOTPLUG_CPU
235 return n_online_successes != n_online_attempts ||
236 n_offline_successes != n_offline_attempts;
237#else /* #ifdef CONFIG_HOTPLUG_CPU */
238 return false;
239#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
240}
241EXPORT_SYMBOL_GPL(torture_onoff_failures);
242
243#define TORTURE_RANDOM_MULT 39916801 /* prime */
244#define TORTURE_RANDOM_ADD 479001701 /* prime */
245#define TORTURE_RANDOM_REFRESH 10000
246
247/*
248 * Crude but fast random-number generator. Uses a linear congruential
249 * generator, with occasional help from cpu_clock().
250 */
251unsigned long
252torture_random(struct torture_random_state *trsp)
253{
254 if (--trsp->trs_count < 0) {
255 trsp->trs_state += (unsigned long)local_clock();
256 trsp->trs_count = TORTURE_RANDOM_REFRESH;
257 }
258 trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT +
259 TORTURE_RANDOM_ADD;
260 return swahw32(trsp->trs_state);
261}
262EXPORT_SYMBOL_GPL(torture_random);
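
torture_random() is a plain linear congruential generator that is occasionally reseeded from the scheduler clock, and callers such as torture_onoff() discard the low-order bits before taking a modulus. The userspace sketch below reimplements it so the idiom can be run standalone; the fixed reseed value stands in for local_clock() and is purely an assumption of the sketch.

    #include <stdio.h>
    #include <stdint.h>

    #define TORTURE_RANDOM_MULT     39916801UL      /* prime */
    #define TORTURE_RANDOM_ADD      479001701UL     /* prime */
    #define TORTURE_RANDOM_REFRESH  10000

    struct torture_random_state {
            unsigned long trs_state;
            long trs_count;
    };

    static uint32_t swahw32(uint32_t x)     /* swap the two 16-bit halfwords */
    {
            return (x << 16) | (x >> 16);
    }

    static unsigned long torture_random(struct torture_random_state *trsp)
    {
            if (--trsp->trs_count < 0) {
                    trsp->trs_state += 0x12345678UL;    /* stand-in for local_clock() */
                    trsp->trs_count = TORTURE_RANDOM_REFRESH;
            }
            trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT +
                              TORTURE_RANDOM_ADD;
            return swahw32((uint32_t)trsp->trs_state);
    }

    int main(void)
    {
            struct torture_random_state rand = { 0, 0 };
            int maxcpu = 7;                 /* pretend CPUs 0..7 are online */
            int i;

            for (i = 0; i < 5; i++)         /* same idiom as torture_onoff() */
                    printf("picked cpu %lu\n",
                           (torture_random(&rand) >> 4) % (maxcpu + 1));
            return 0;
    }
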
263
264/*
265 * Variables for shuffling. The idea is to ensure that each CPU stays
266 * idle for an extended period to test interactions with dyntick idle,
267 * as well as interactions with any per-CPU variables.
268 */
269struct shuffle_task {
270 struct list_head st_l;
271 struct task_struct *st_t;
272};
273
274static long shuffle_interval; /* In jiffies. */
275static struct task_struct *shuffler_task;
276static cpumask_var_t shuffle_tmp_mask;
277static int shuffle_idle_cpu; /* Force all torture tasks off this CPU */
278static struct list_head shuffle_task_list = LIST_HEAD_INIT(shuffle_task_list);
279static DEFINE_MUTEX(shuffle_task_mutex);
280
281/*
282 * Register a task to be shuffled. If there is no memory, just splat
283 * and don't bother registering.
284 */
285void torture_shuffle_task_register(struct task_struct *tp)
286{
287 struct shuffle_task *stp;
288
289 if (WARN_ON_ONCE(tp == NULL))
290 return;
291 stp = kmalloc(sizeof(*stp), GFP_KERNEL);
292 if (WARN_ON_ONCE(stp == NULL))
293 return;
294 stp->st_t = tp;
295 mutex_lock(&shuffle_task_mutex);
296 list_add(&stp->st_l, &shuffle_task_list);
297 mutex_unlock(&shuffle_task_mutex);
298}
299EXPORT_SYMBOL_GPL(torture_shuffle_task_register);
300
301/*
302 * Unregister all tasks, for example, at the end of the torture run.
303 */
304static void torture_shuffle_task_unregister_all(void)
305{
306 struct shuffle_task *stp;
307 struct shuffle_task *p;
308
309 mutex_lock(&shuffle_task_mutex);
310 list_for_each_entry_safe(stp, p, &shuffle_task_list, st_l) {
311 list_del(&stp->st_l);
312 kfree(stp);
313 }
314 mutex_unlock(&shuffle_task_mutex);
315}
316
317/* Shuffle tasks such that we allow shuffle_idle_cpu to become idle.
318 * A special case is when shuffle_idle_cpu = -1, in which case we allow
319 * the tasks to run on all CPUs.
320 */
321static void torture_shuffle_tasks(void)
322{
323 struct shuffle_task *stp;
324
325 cpumask_setall(shuffle_tmp_mask);
326 get_online_cpus();
327
328 /* No point in shuffling if there is only one online CPU (ex: UP) */
329 if (num_online_cpus() == 1) {
330 put_online_cpus();
331 return;
332 }
333
334 /* Advance to the next CPU. Upon overflow, don't idle any CPUs. */
335 shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask);
336 if (shuffle_idle_cpu >= nr_cpu_ids)
337 shuffle_idle_cpu = -1;
338 if (shuffle_idle_cpu != -1) {
339 cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask);
340 if (cpumask_empty(shuffle_tmp_mask)) {
341 put_online_cpus();
342 return;
343 }
344 }
345
346 mutex_lock(&shuffle_task_mutex);
347 list_for_each_entry(stp, &shuffle_task_list, st_l)
348 set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
349 mutex_unlock(&shuffle_task_mutex);
350
351 put_online_cpus();
352}
353
354/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
355 * system to become idle in turn and cut off its timer ticks. This is meant
356 * to test the support for such tickless idle CPUs in RCU.
357 */
358static int torture_shuffle(void *arg)
359{
360 VERBOSE_TOROUT_STRING("torture_shuffle task started");
361 do {
362 schedule_timeout_interruptible(shuffle_interval);
363 torture_shuffle_tasks();
364 torture_shutdown_absorb("torture_shuffle");
365 } while (!torture_must_stop());
366 torture_kthread_stopping("torture_shuffle");
367 return 0;
368}
369
370/*
371 * Start the shuffler, with shuffint in jiffies.
372 */
373int torture_shuffle_init(long shuffint)
374{
375 shuffle_interval = shuffint;
376
377 shuffle_idle_cpu = -1;
378
379 if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
380 VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask");
381 return -ENOMEM;
382 }
383
384 /* Create the shuffler thread */
385 return torture_create_kthread(torture_shuffle, NULL, shuffler_task);
386}
387EXPORT_SYMBOL_GPL(torture_shuffle_init);
388
389/*
390 * Stop the shuffling.
391 */
392static void torture_shuffle_cleanup(void)
393{
394 torture_shuffle_task_unregister_all();
395 if (shuffler_task) {
396 VERBOSE_TOROUT_STRING("Stopping torture_shuffle task");
397 kthread_stop(shuffler_task);
398 free_cpumask_var(shuffle_tmp_mask);
399 }
400 shuffler_task = NULL;
401}
402EXPORT_SYMBOL_GPL(torture_shuffle_cleanup);
403
404/*
405 * Variables for auto-shutdown. This allows "lights out" torture runs
406 * to be fully scripted.
407 */
408static int shutdown_secs; /* desired test duration in seconds. */
409static struct task_struct *shutdown_task;
410static unsigned long shutdown_time; /* jiffies to system shutdown. */
411static void (*torture_shutdown_hook)(void);
412
413/*
414 * Absorb kthreads into a kernel function that won't return, so that
415 * they won't ever access module text or data again.
416 */
417void torture_shutdown_absorb(const char *title)
418{
419 while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
420 pr_notice("torture thread %s parking due to system shutdown\n",
421 title);
422 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
423 }
424}
425EXPORT_SYMBOL_GPL(torture_shutdown_absorb);
426
427/*
428 * Cause the torture test to shutdown the system after the test has
429 * run for the time specified by the shutdown_secs parameter.
430 */
431static int torture_shutdown(void *arg)
432{
433 long delta;
434 unsigned long jiffies_snap;
435
436 VERBOSE_TOROUT_STRING("torture_shutdown task started");
437 jiffies_snap = jiffies;
438 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
439 !torture_must_stop()) {
440 delta = shutdown_time - jiffies_snap;
441 if (verbose)
442 pr_alert("%s" TORTURE_FLAG
443 "torture_shutdown task: %lu jiffies remaining\n",
444 torture_type, delta);
445 schedule_timeout_interruptible(delta);
446 jiffies_snap = jiffies;
447 }
448 if (torture_must_stop()) {
449 torture_kthread_stopping("torture_shutdown");
450 return 0;
451 }
452
453 /* OK, shut down the system. */
454
455 VERBOSE_TOROUT_STRING("torture_shutdown task shutting down system");
456 shutdown_task = NULL; /* Avoid self-kill deadlock. */
457 if (torture_shutdown_hook)
458 torture_shutdown_hook();
459 else
460 VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
461 kernel_power_off(); /* Shut down the system. */
462 return 0;
463}
464
465/*
466 * Start up the shutdown task.
467 */
468int torture_shutdown_init(int ssecs, void (*cleanup)(void))
469{
470 int ret = 0;
471
472 shutdown_secs = ssecs;
473 torture_shutdown_hook = cleanup;
474 if (shutdown_secs > 0) {
475 shutdown_time = jiffies + shutdown_secs * HZ;
476 ret = torture_create_kthread(torture_shutdown, NULL,
477 shutdown_task);
478 }
479 return ret;
480}
481EXPORT_SYMBOL_GPL(torture_shutdown_init);
482
483/*
484 * Detect and respond to a system shutdown.
485 */
486static int torture_shutdown_notify(struct notifier_block *unused1,
487 unsigned long unused2, void *unused3)
488{
489 mutex_lock(&fullstop_mutex);
490 if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
491 VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected");
492 ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN;
493 } else {
494 pr_warn("Concurrent rmmod and shutdown illegal!\n");
495 }
496 mutex_unlock(&fullstop_mutex);
497 return NOTIFY_DONE;
498}
499
500static struct notifier_block torture_shutdown_nb = {
501 .notifier_call = torture_shutdown_notify,
502};
503
504/*
505 * Shut down the shutdown task. Say what??? Heh! This can happen if
506 * the torture module gets an rmmod before the shutdown time arrives. ;-)
507 */
508static void torture_shutdown_cleanup(void)
509{
510 unregister_reboot_notifier(&torture_shutdown_nb);
511 if (shutdown_task != NULL) {
512 VERBOSE_TOROUT_STRING("Stopping torture_shutdown task");
513 kthread_stop(shutdown_task);
514 }
515 shutdown_task = NULL;
516}
517
518/*
519 * Variables for stuttering, which means to periodically pause and
520 * restart testing in order to catch bugs that appear when load is
521 * suddenly applied to or removed from the system.
522 */
523static struct task_struct *stutter_task;
524static int stutter_pause_test;
525static int stutter;
526
527/*
528 * Block until the stutter interval ends. This must be called periodically
529 * by all running kthreads that need to be subject to stuttering.
530 */
531void stutter_wait(const char *title)
532{
533 while (ACCESS_ONCE(stutter_pause_test) ||
534 (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
535 if (stutter_pause_test)
536 schedule_timeout_interruptible(1);
537 else
538 schedule_timeout_interruptible(round_jiffies_relative(HZ));
539 torture_shutdown_absorb(title);
540 }
541}
542EXPORT_SYMBOL_GPL(stutter_wait);
543
544/*
545 * Cause the torture test to "stutter", starting and stopping all
546 * threads periodically.
547 */
548static int torture_stutter(void *arg)
549{
550 VERBOSE_TOROUT_STRING("torture_stutter task started");
551 do {
552 if (!torture_must_stop()) {
553 schedule_timeout_interruptible(stutter);
554 ACCESS_ONCE(stutter_pause_test) = 1;
555 }
556 if (!torture_must_stop())
557 schedule_timeout_interruptible(stutter);
558 ACCESS_ONCE(stutter_pause_test) = 0;
559 torture_shutdown_absorb("torture_stutter");
560 } while (!torture_must_stop());
561 torture_kthread_stopping("torture_stutter");
562 return 0;
563}
564
565/*
566 * Initialize and kick off the torture_stutter kthread.
567 */
568int torture_stutter_init(int s)
569{
570 int ret;
571
572 stutter = s;
573 ret = torture_create_kthread(torture_stutter, NULL, stutter_task);
574 return ret;
575}
576EXPORT_SYMBOL_GPL(torture_stutter_init);
577
578/*
579 * Cleanup after the torture_stutter kthread.
580 */
581static void torture_stutter_cleanup(void)
582{
583 if (!stutter_task)
584 return;
585 VERBOSE_TOROUT_STRING("Stopping torture_stutter task");
586 kthread_stop(stutter_task);
587 stutter_task = NULL;
588}
589
590/*
591 * Initialize torture module. Please note that this is -not- invoked via
592 * the usual module_init() mechanism, but rather by an explicit call from
593 * the client torture module. This call must be paired with a later
594 * torture_init_end().
595 *
596 * The runnable parameter points to a flag that controls whether or not
597 * the test is currently runnable. If there is no such flag, pass in NULL.
598 */
599void __init torture_init_begin(char *ttype, bool v, int *runnable)
600{
601 mutex_lock(&fullstop_mutex);
602 torture_type = ttype;
603 verbose = v;
604 torture_runnable = runnable;
605 fullstop = FULLSTOP_DONTSTOP;
606
607}
608EXPORT_SYMBOL_GPL(torture_init_begin);
609
610/*
611 * Tell the torture module that initialization is complete.
612 */
613void __init torture_init_end(void)
614{
615 mutex_unlock(&fullstop_mutex);
616 register_reboot_notifier(&torture_shutdown_nb);
617}
618EXPORT_SYMBOL_GPL(torture_init_end);
619
620/*
621 * Clean up torture module. Please note that this is -not- invoked via
622 * the usual module_exit() mechanism, but rather by an explicit call from
623 * the client torture module. Returns true if a race with system shutdown
624 * is detected; otherwise, all kthreads started by functions in this file
625 * will be shut down.
626 *
627 * This must be called before the caller starts shutting down its own
628 * kthreads.
629 */
630bool torture_cleanup(void)
631{
632 mutex_lock(&fullstop_mutex);
633 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
634 pr_warn("Concurrent rmmod and shutdown illegal!\n");
635 mutex_unlock(&fullstop_mutex);
636 schedule_timeout_uninterruptible(10);
637 return true;
638 }
639 ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD;
640 mutex_unlock(&fullstop_mutex);
641 torture_shutdown_cleanup();
642 torture_shuffle_cleanup();
643 torture_stutter_cleanup();
644 torture_onoff_cleanup();
645 return false;
646}
647EXPORT_SYMBOL_GPL(torture_cleanup);
648
649/*
650 * Is it time for the current torture test to stop?
651 */
652bool torture_must_stop(void)
653{
654 return torture_must_stop_irq() || kthread_should_stop();
655}
656EXPORT_SYMBOL_GPL(torture_must_stop);
657
658/*
659 * Is it time for the current torture test to stop? This is the irq-safe
660 * version, hence no check for kthread_should_stop().
661 */
662bool torture_must_stop_irq(void)
663{
664 return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
665}
666EXPORT_SYMBOL_GPL(torture_must_stop_irq);
667
668/*
669 * Each kthread must wait for kthread_should_stop() before returning from
670 * its top-level function; otherwise segfaults ensue. This function
671 * prints a "stopping" message and waits for kthread_should_stop(), and
672 * should be called from all torture kthreads immediately prior to
673 * returning.
674 */
675void torture_kthread_stopping(char *title)
676{
677 if (verbose)
678 VERBOSE_TOROUT_STRING(title);
679 while (!kthread_should_stop()) {
680 torture_shutdown_absorb(title);
681 schedule_timeout_uninterruptible(1);
682 }
683}
684EXPORT_SYMBOL_GPL(torture_kthread_stopping);
685
686/*
687 * Create a generic torture kthread that is immediately runnable. If you
688 * need the kthread to be stopped so that you can do something to it before
689 * it starts, you will need to open-code your own.
690 */
691int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
692 char *f, struct task_struct **tp)
693{
694 int ret = 0;
695
696 VERBOSE_TOROUT_STRING(m);
697 *tp = kthread_run(fn, arg, s);
698 if (IS_ERR(*tp)) {
699 ret = PTR_ERR(*tp);
700 VERBOSE_TOROUT_ERRSTRING(f);
701 *tp = NULL;
702 }
703 torture_shuffle_task_register(*tp);
704 return ret;
705}
706EXPORT_SYMBOL_GPL(_torture_create_kthread);
707
708/*
709 * Stop a generic kthread, emitting a message.
710 */
711void _torture_stop_kthread(char *m, struct task_struct **tp)
712{
713 if (*tp == NULL)
714 return;
715 VERBOSE_TOROUT_STRING(m);
716 kthread_stop(*tp);
717 *tp = NULL;
718}
719EXPORT_SYMBOL_GPL(_torture_stop_kthread);
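
The new kernel/torture.c exports a small lifecycle API: torture_init_begin()/torture_init_end() bracket setup, torture_cleanup() brackets teardown, and kthreads are created, stuttered and stopped through the helpers above. The following is a hedged sketch of how a client torture module might wire these together, based only on the signatures in this file; the module name, parameters and loop body are invented, and the torture_create_kthread()/torture_stop_kthread() wrappers are assumed to come from <linux/torture.h>.

    /* example_torture.c -- hypothetical client of kernel/torture.c */
    #include <linux/kernel.h>
    #include <linux/module.h>
    #include <linux/kthread.h>
    #include <linux/torture.h>

    static char example_type[] = "example";
    static int example_runnable = 1;
    static struct task_struct *example_task;

    static int example_loop(void *arg)
    {
            do {
                    /* ... exercise the subsystem under test here ... */
                    stutter_wait("example_loop");
            } while (!torture_must_stop());
            torture_kthread_stopping("example_loop");
            return 0;
    }

    static int __init example_init(void)
    {
            int ret;

            torture_init_begin(example_type, true, &example_runnable);
            ret = torture_shuffle_init(3 * HZ);     /* shuffle every 3s */
            if (!ret)
                    ret = torture_stutter_init(5 * HZ);
            if (!ret)
                    ret = torture_create_kthread(example_loop, NULL,
                                                 example_task);
            torture_init_end();
            return ret;             /* error unwinding elided for brevity */
    }

    static void __exit example_exit(void)
    {
            if (torture_cleanup())
                    return;         /* raced with a system shutdown */
            torture_stop_kthread(example_loop, example_task);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");
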
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 015f85aaca08..8639819f6cef 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -424,6 +424,7 @@ config UPROBE_EVENT
424 bool "Enable uprobes-based dynamic events" 424 bool "Enable uprobes-based dynamic events"
425 depends on ARCH_SUPPORTS_UPROBES 425 depends on ARCH_SUPPORTS_UPROBES
426 depends on MMU 426 depends on MMU
427 depends on PERF_EVENTS
427 select UPROBES 428 select UPROBES
428 select PROBE_EVENTS 429 select PROBE_EVENTS
429 select TRACING 430 select TRACING
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b418cb0d7242..c1bd4ada2a04 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -702,6 +702,7 @@ void blk_trace_shutdown(struct request_queue *q)
702 * blk_add_trace_rq - Add a trace for a request oriented action 702 * blk_add_trace_rq - Add a trace for a request oriented action
703 * @q: queue the io is for 703 * @q: queue the io is for
704 * @rq: the source request 704 * @rq: the source request
705 * @nr_bytes: number of completed bytes
705 * @what: the action 706 * @what: the action
706 * 707 *
707 * Description: 708 * Description:
@@ -709,7 +710,7 @@ void blk_trace_shutdown(struct request_queue *q)
709 * 710 *
710 **/ 711 **/
711static void blk_add_trace_rq(struct request_queue *q, struct request *rq, 712static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
712 u32 what) 713 unsigned int nr_bytes, u32 what)
713{ 714{
714 struct blk_trace *bt = q->blk_trace; 715 struct blk_trace *bt = q->blk_trace;
715 716
@@ -718,11 +719,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
718 719
719 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 720 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
720 what |= BLK_TC_ACT(BLK_TC_PC); 721 what |= BLK_TC_ACT(BLK_TC_PC);
721 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, 722 __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags,
722 what, rq->errors, rq->cmd_len, rq->cmd); 723 what, rq->errors, rq->cmd_len, rq->cmd);
723 } else { 724 } else {
724 what |= BLK_TC_ACT(BLK_TC_FS); 725 what |= BLK_TC_ACT(BLK_TC_FS);
725 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 726 __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes,
726 rq->cmd_flags, what, rq->errors, 0, NULL); 727 rq->cmd_flags, what, rq->errors, 0, NULL);
727 } 728 }
728} 729}
@@ -730,33 +731,34 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
730static void blk_add_trace_rq_abort(void *ignore, 731static void blk_add_trace_rq_abort(void *ignore,
731 struct request_queue *q, struct request *rq) 732 struct request_queue *q, struct request *rq)
732{ 733{
733 blk_add_trace_rq(q, rq, BLK_TA_ABORT); 734 blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT);
734} 735}
735 736
736static void blk_add_trace_rq_insert(void *ignore, 737static void blk_add_trace_rq_insert(void *ignore,
737 struct request_queue *q, struct request *rq) 738 struct request_queue *q, struct request *rq)
738{ 739{
739 blk_add_trace_rq(q, rq, BLK_TA_INSERT); 740 blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT);
740} 741}
741 742
742static void blk_add_trace_rq_issue(void *ignore, 743static void blk_add_trace_rq_issue(void *ignore,
743 struct request_queue *q, struct request *rq) 744 struct request_queue *q, struct request *rq)
744{ 745{
745 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 746 blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE);
746} 747}
747 748
748static void blk_add_trace_rq_requeue(void *ignore, 749static void blk_add_trace_rq_requeue(void *ignore,
749 struct request_queue *q, 750 struct request_queue *q,
750 struct request *rq) 751 struct request *rq)
751{ 752{
752 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 753 blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE);
753} 754}
754 755
755static void blk_add_trace_rq_complete(void *ignore, 756static void blk_add_trace_rq_complete(void *ignore,
756 struct request_queue *q, 757 struct request_queue *q,
757 struct request *rq) 758 struct request *rq,
759 unsigned int nr_bytes)
758{ 760{
759 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 761 blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE);
760} 762}
761 763
762/** 764/**
@@ -1427,7 +1429,8 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1427 return print_one_line(iter, true); 1429 return print_one_line(iter, true);
1428} 1430}
1429 1431
1430static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) 1432static int
1433blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
1431{ 1434{
1432 /* don't output context-info for blk_classic output */ 1435 /* don't output context-info for blk_classic output */
1433 if (bit == TRACE_BLK_OPT_CLASSIC) { 1436 if (bit == TRACE_BLK_OPT_CLASSIC) {
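
Because the block_rq_complete tracepoint now carries nr_bytes, any probe attached to it has to take the extra argument, exactly as blk_add_trace_rq_complete() does above. A hedged sketch of such a probe (the probe body and names are illustrative only, not in-tree code):

    #include <linux/kernel.h>
    #include <linux/blkdev.h>
    #include <trace/events/block.h>

    static void my_rq_complete_probe(void *ignore, struct request_queue *q,
                                     struct request *rq, unsigned int nr_bytes)
    {
            pr_info("rq complete: %u of %u bytes done\n",
                    nr_bytes, blk_rq_bytes(rq));
    }

    static int __init my_probe_init(void)
    {
            /* register_trace_block_rq_complete() is generated by the
             * tracepoint machinery from the block_rq_complete definition */
            return register_trace_block_rq_complete(my_rq_complete_probe, NULL);
    }
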
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cd7f76d1eb86..1fd4b9479210 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -237,14 +237,13 @@ static int control_ops_alloc(struct ftrace_ops *ops)
237 return 0; 237 return 0;
238} 238}
239 239
240static void control_ops_free(struct ftrace_ops *ops)
241{
242 free_percpu(ops->disabled);
243}
244
245static void update_global_ops(void) 240static void update_global_ops(void)
246{ 241{
247 ftrace_func_t func; 242 ftrace_func_t func = ftrace_global_list_func;
243 void *private = NULL;
244
245 /* The list has its own recursion protection. */
246 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
248 247
249 /* 248 /*
250 * If there's only one function registered, then call that 249 * If there's only one function registered, then call that
@@ -254,23 +253,17 @@ static void update_global_ops(void)
254 if (ftrace_global_list == &ftrace_list_end || 253 if (ftrace_global_list == &ftrace_list_end ||
255 ftrace_global_list->next == &ftrace_list_end) { 254 ftrace_global_list->next == &ftrace_list_end) {
256 func = ftrace_global_list->func; 255 func = ftrace_global_list->func;
256 private = ftrace_global_list->private;
257 /* 257 /*
258 * As we are calling the function directly. 258 * As we are calling the function directly.
259 * If it does not have recursion protection, 259 * If it does not have recursion protection,
260 * the function_trace_op needs to be updated 260 * the function_trace_op needs to be updated
261 * accordingly. 261 * accordingly.
262 */ 262 */
263 if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) 263 if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE))
264 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
265 else
266 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; 264 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
267 } else {
268 func = ftrace_global_list_func;
269 /* The list has its own recursion protection. */
270 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
271 } 265 }
272 266
273
274 /* If we filter on pids, update to use the pid function */ 267 /* If we filter on pids, update to use the pid function */
275 if (!list_empty(&ftrace_pids)) { 268 if (!list_empty(&ftrace_pids)) {
276 set_ftrace_pid_function(func); 269 set_ftrace_pid_function(func);
@@ -278,6 +271,7 @@ static void update_global_ops(void)
278 } 271 }
279 272
280 global_ops.func = func; 273 global_ops.func = func;
274 global_ops.private = private;
281} 275}
282 276
283static void ftrace_sync(struct work_struct *work) 277static void ftrace_sync(struct work_struct *work)
@@ -437,6 +431,9 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
437 431
438static int __register_ftrace_function(struct ftrace_ops *ops) 432static int __register_ftrace_function(struct ftrace_ops *ops)
439{ 433{
434 if (ops->flags & FTRACE_OPS_FL_DELETED)
435 return -EINVAL;
436
440 if (FTRACE_WARN_ON(ops == &global_ops)) 437 if (FTRACE_WARN_ON(ops == &global_ops))
441 return -EINVAL; 438 return -EINVAL;
442 439
@@ -1172,8 +1169,6 @@ struct ftrace_page {
1172 int size; 1169 int size;
1173}; 1170};
1174 1171
1175static struct ftrace_page *ftrace_new_pgs;
1176
1177#define ENTRY_SIZE sizeof(struct dyn_ftrace) 1172#define ENTRY_SIZE sizeof(struct dyn_ftrace)
1178#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) 1173#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
1179 1174
@@ -1560,7 +1555,7 @@ unsigned long ftrace_location(unsigned long ip)
1560 * the function tracer. It checks the ftrace internal tables to 1555 * the function tracer. It checks the ftrace internal tables to
1561 * determine if the address belongs or not. 1556 * determine if the address belongs or not.
1562 */ 1557 */
1563int ftrace_text_reserved(void *start, void *end) 1558int ftrace_text_reserved(const void *start, const void *end)
1564{ 1559{
1565 unsigned long ret; 1560 unsigned long ret;
1566 1561
@@ -1994,6 +1989,7 @@ int __weak ftrace_arch_code_modify_post_process(void)
1994void ftrace_modify_all_code(int command) 1989void ftrace_modify_all_code(int command)
1995{ 1990{
1996 int update = command & FTRACE_UPDATE_TRACE_FUNC; 1991 int update = command & FTRACE_UPDATE_TRACE_FUNC;
1992 int err = 0;
1997 1993
1998 /* 1994 /*
1999 * If the ftrace_caller calls a ftrace_ops func directly, 1995 * If the ftrace_caller calls a ftrace_ops func directly,
@@ -2005,8 +2001,11 @@ void ftrace_modify_all_code(int command)
2005 * to make sure the ops are having the right functions 2001 * to make sure the ops are having the right functions
2006 * traced. 2002 * traced.
2007 */ 2003 */
2008 if (update) 2004 if (update) {
2009 ftrace_update_ftrace_func(ftrace_ops_list_func); 2005 err = ftrace_update_ftrace_func(ftrace_ops_list_func);
2006 if (FTRACE_WARN_ON(err))
2007 return;
2008 }
2010 2009
2011 if (command & FTRACE_UPDATE_CALLS) 2010 if (command & FTRACE_UPDATE_CALLS)
2012 ftrace_replace_code(1); 2011 ftrace_replace_code(1);
@@ -2019,13 +2018,16 @@ void ftrace_modify_all_code(int command)
2019 /* If irqs are disabled, we are in stop machine */ 2018 /* If irqs are disabled, we are in stop machine */
2020 if (!irqs_disabled()) 2019 if (!irqs_disabled())
2021 smp_call_function(ftrace_sync_ipi, NULL, 1); 2020 smp_call_function(ftrace_sync_ipi, NULL, 1);
2022 ftrace_update_ftrace_func(ftrace_trace_function); 2021 err = ftrace_update_ftrace_func(ftrace_trace_function);
2022 if (FTRACE_WARN_ON(err))
2023 return;
2023 } 2024 }
2024 2025
2025 if (command & FTRACE_START_FUNC_RET) 2026 if (command & FTRACE_START_FUNC_RET)
2026 ftrace_enable_ftrace_graph_caller(); 2027 err = ftrace_enable_ftrace_graph_caller();
2027 else if (command & FTRACE_STOP_FUNC_RET) 2028 else if (command & FTRACE_STOP_FUNC_RET)
2028 ftrace_disable_ftrace_graph_caller(); 2029 err = ftrace_disable_ftrace_graph_caller();
2030 FTRACE_WARN_ON(err);
2029} 2031}
2030 2032
2031static int __ftrace_modify_code(void *data) 2033static int __ftrace_modify_code(void *data)
@@ -2093,6 +2095,11 @@ static ftrace_func_t saved_ftrace_func;
2093static int ftrace_start_up; 2095static int ftrace_start_up;
2094static int global_start_up; 2096static int global_start_up;
2095 2097
2098static void control_ops_free(struct ftrace_ops *ops)
2099{
2100 free_percpu(ops->disabled);
2101}
2102
2096static void ftrace_startup_enable(int command) 2103static void ftrace_startup_enable(int command)
2097{ 2104{
2098 if (saved_ftrace_func != ftrace_trace_function) { 2105 if (saved_ftrace_func != ftrace_trace_function) {
@@ -2244,7 +2251,6 @@ static void ftrace_shutdown_sysctl(void)
2244} 2251}
2245 2252
2246static cycle_t ftrace_update_time; 2253static cycle_t ftrace_update_time;
2247static unsigned long ftrace_update_cnt;
2248unsigned long ftrace_update_tot_cnt; 2254unsigned long ftrace_update_tot_cnt;
2249 2255
2250static inline int ops_traces_mod(struct ftrace_ops *ops) 2256static inline int ops_traces_mod(struct ftrace_ops *ops)
@@ -2300,11 +2306,12 @@ static int referenced_filters(struct dyn_ftrace *rec)
2300 return cnt; 2306 return cnt;
2301} 2307}
2302 2308
2303static int ftrace_update_code(struct module *mod) 2309static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
2304{ 2310{
2305 struct ftrace_page *pg; 2311 struct ftrace_page *pg;
2306 struct dyn_ftrace *p; 2312 struct dyn_ftrace *p;
2307 cycle_t start, stop; 2313 cycle_t start, stop;
2314 unsigned long update_cnt = 0;
2308 unsigned long ref = 0; 2315 unsigned long ref = 0;
2309 bool test = false; 2316 bool test = false;
2310 int i; 2317 int i;
@@ -2330,9 +2337,8 @@ static int ftrace_update_code(struct module *mod)
2330 } 2337 }
2331 2338
2332 start = ftrace_now(raw_smp_processor_id()); 2339 start = ftrace_now(raw_smp_processor_id());
2333 ftrace_update_cnt = 0;
2334 2340
2335 for (pg = ftrace_new_pgs; pg; pg = pg->next) { 2341 for (pg = new_pgs; pg; pg = pg->next) {
2336 2342
2337 for (i = 0; i < pg->index; i++) { 2343 for (i = 0; i < pg->index; i++) {
2338 int cnt = ref; 2344 int cnt = ref;
@@ -2353,7 +2359,7 @@ static int ftrace_update_code(struct module *mod)
2353 if (!ftrace_code_disable(mod, p)) 2359 if (!ftrace_code_disable(mod, p))
2354 break; 2360 break;
2355 2361
2356 ftrace_update_cnt++; 2362 update_cnt++;
2357 2363
2358 /* 2364 /*
2359 * If the tracing is enabled, go ahead and enable the record. 2365 * If the tracing is enabled, go ahead and enable the record.
@@ -2372,11 +2378,9 @@ static int ftrace_update_code(struct module *mod)
2372 } 2378 }
2373 } 2379 }
2374 2380
2375 ftrace_new_pgs = NULL;
2376
2377 stop = ftrace_now(raw_smp_processor_id()); 2381 stop = ftrace_now(raw_smp_processor_id());
2378 ftrace_update_time = stop - start; 2382 ftrace_update_time = stop - start;
2379 ftrace_update_tot_cnt += ftrace_update_cnt; 2383 ftrace_update_tot_cnt += update_cnt;
2380 2384
2381 return 0; 2385 return 0;
2382} 2386}
@@ -2468,22 +2472,6 @@ ftrace_allocate_pages(unsigned long num_to_init)
2468 return NULL; 2472 return NULL;
2469} 2473}
2470 2474
2471static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
2472{
2473 int cnt;
2474
2475 if (!num_to_init) {
2476 pr_info("ftrace: No functions to be traced?\n");
2477 return -1;
2478 }
2479
2480 cnt = num_to_init / ENTRIES_PER_PAGE;
2481 pr_info("ftrace: allocating %ld entries in %d pages\n",
2482 num_to_init, cnt + 1);
2483
2484 return 0;
2485}
2486
2487#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 2475#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
2488 2476
2489struct ftrace_iterator { 2477struct ftrace_iterator {
@@ -2871,7 +2859,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2871static int 2859static int
2872ftrace_filter_open(struct inode *inode, struct file *file) 2860ftrace_filter_open(struct inode *inode, struct file *file)
2873{ 2861{
2874 return ftrace_regex_open(&global_ops, 2862 struct ftrace_ops *ops = inode->i_private;
2863
2864 return ftrace_regex_open(ops,
2875 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, 2865 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
2876 inode, file); 2866 inode, file);
2877} 2867}
@@ -2879,7 +2869,9 @@ ftrace_filter_open(struct inode *inode, struct file *file)
2879static int 2869static int
2880ftrace_notrace_open(struct inode *inode, struct file *file) 2870ftrace_notrace_open(struct inode *inode, struct file *file)
2881{ 2871{
2882 return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, 2872 struct ftrace_ops *ops = inode->i_private;
2873
2874 return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE,
2883 inode, file); 2875 inode, file);
2884} 2876}
2885 2877
@@ -4109,6 +4101,36 @@ static const struct file_operations ftrace_graph_notrace_fops = {
4109}; 4101};
4110#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 4102#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
4111 4103
4104void ftrace_create_filter_files(struct ftrace_ops *ops,
4105 struct dentry *parent)
4106{
4107
4108 trace_create_file("set_ftrace_filter", 0644, parent,
4109 ops, &ftrace_filter_fops);
4110
4111 trace_create_file("set_ftrace_notrace", 0644, parent,
4112 ops, &ftrace_notrace_fops);
4113}
4114
4115/*
4116 * The name "destroy_filter_files" is really a misnomer. Although
4117 * it may actually delete the files in the future, this is
4118 * really intended to make sure the ops passed in are disabled
4119 * and that when this function returns, the caller is free to
4120 * free the ops.
4121 *
4122 * The "destroy" name is only to match the "create" name that this
4123 * should be paired with.
4124 */
4125void ftrace_destroy_filter_files(struct ftrace_ops *ops)
4126{
4127 mutex_lock(&ftrace_lock);
4128 if (ops->flags & FTRACE_OPS_FL_ENABLED)
4129 ftrace_shutdown(ops, 0);
4130 ops->flags |= FTRACE_OPS_FL_DELETED;
4131 mutex_unlock(&ftrace_lock);
4132}
4133
4112static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) 4134static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
4113{ 4135{
4114 4136
@@ -4118,11 +4140,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
4118 trace_create_file("enabled_functions", 0444, 4140 trace_create_file("enabled_functions", 0444,
4119 d_tracer, NULL, &ftrace_enabled_fops); 4141 d_tracer, NULL, &ftrace_enabled_fops);
4120 4142
4121 trace_create_file("set_ftrace_filter", 0644, d_tracer, 4143 ftrace_create_filter_files(&global_ops, d_tracer);
4122 NULL, &ftrace_filter_fops);
4123
4124 trace_create_file("set_ftrace_notrace", 0644, d_tracer,
4125 NULL, &ftrace_notrace_fops);
4126 4144
4127#ifdef CONFIG_FUNCTION_GRAPH_TRACER 4145#ifdef CONFIG_FUNCTION_GRAPH_TRACER
4128 trace_create_file("set_graph_function", 0444, d_tracer, 4146 trace_create_file("set_graph_function", 0444, d_tracer,
@@ -4238,9 +4256,6 @@ static int ftrace_process_locs(struct module *mod,
4238 /* Assign the last page to ftrace_pages */ 4256 /* Assign the last page to ftrace_pages */
4239 ftrace_pages = pg; 4257 ftrace_pages = pg;
4240 4258
4241 /* These new locations need to be initialized */
4242 ftrace_new_pgs = start_pg;
4243
4244 /* 4259 /*
4245 * We only need to disable interrupts on start up 4260 * We only need to disable interrupts on start up
4246 * because we are modifying code that an interrupt 4261 * because we are modifying code that an interrupt
@@ -4251,7 +4266,7 @@ static int ftrace_process_locs(struct module *mod,
4251 */ 4266 */
4252 if (!mod) 4267 if (!mod)
4253 local_irq_save(flags); 4268 local_irq_save(flags);
4254 ftrace_update_code(mod); 4269 ftrace_update_code(mod, start_pg);
4255 if (!mod) 4270 if (!mod)
4256 local_irq_restore(flags); 4271 local_irq_restore(flags);
4257 ret = 0; 4272 ret = 0;
@@ -4360,30 +4375,27 @@ struct notifier_block ftrace_module_exit_nb = {
4360 .priority = INT_MIN, /* Run after anything that can remove kprobes */ 4375 .priority = INT_MIN, /* Run after anything that can remove kprobes */
4361}; 4376};
4362 4377
4363extern unsigned long __start_mcount_loc[];
4364extern unsigned long __stop_mcount_loc[];
4365
4366void __init ftrace_init(void) 4378void __init ftrace_init(void)
4367{ 4379{
4368 unsigned long count, addr, flags; 4380 extern unsigned long __start_mcount_loc[];
4381 extern unsigned long __stop_mcount_loc[];
4382 unsigned long count, flags;
4369 int ret; 4383 int ret;
4370 4384
4371 /* Keep the ftrace pointer to the stub */
4372 addr = (unsigned long)ftrace_stub;
4373
4374 local_irq_save(flags); 4385 local_irq_save(flags);
4375 ftrace_dyn_arch_init(&addr); 4386 ret = ftrace_dyn_arch_init();
4376 local_irq_restore(flags); 4387 local_irq_restore(flags);
4377 4388 if (ret)
4378 /* ftrace_dyn_arch_init places the return code in addr */
4379 if (addr)
4380 goto failed; 4389 goto failed;
4381 4390
4382 count = __stop_mcount_loc - __start_mcount_loc; 4391 count = __stop_mcount_loc - __start_mcount_loc;
4383 4392 if (!count) {
4384 ret = ftrace_dyn_table_alloc(count); 4393 pr_info("ftrace: No functions to be traced?\n");
4385 if (ret)
4386 goto failed; 4394 goto failed;
4395 }
4396
4397 pr_info("ftrace: allocating %ld entries in %ld pages\n",
4398 count, count / ENTRIES_PER_PAGE + 1);
4387 4399
4388 last_ftrace_enabled = ftrace_enabled = 1; 4400 last_ftrace_enabled = ftrace_enabled = 1;
4389 4401
@@ -4431,7 +4443,13 @@ static inline void ftrace_startup_enable(int command) { }
4431 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ 4443 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
4432 ___ret; \ 4444 ___ret; \
4433 }) 4445 })
4434# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops) 4446# define ftrace_shutdown(ops, command) \
4447 ({ \
4448 int ___ret = __unregister_ftrace_function(ops); \
4449 if (!___ret) \
4450 (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \
4451 ___ret; \
4452 })
4435 4453
4436# define ftrace_startup_sysctl() do { } while (0) 4454# define ftrace_startup_sysctl() do { } while (0)
4437# define ftrace_shutdown_sysctl() do { } while (0) 4455# define ftrace_shutdown_sysctl() do { } while (0)
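
ftrace_create_filter_files()/ftrace_destroy_filter_files() let a tracer give its private ftrace_ops its own set_ftrace_filter and set_ftrace_notrace files. They are declared in the local kernel/trace headers rather than a public API, so the sketch below assumes code living inside kernel/trace; my_ops, my_func and the directory handling are illustrative only.

    /* Sketch only: these helpers are internal to kernel/trace. */
    #include <linux/ftrace.h>
    #include "trace.h"              /* local kernel/trace header (assumption) */

    static void my_func(unsigned long ip, unsigned long parent_ip,
                        struct ftrace_ops *op, struct pt_regs *regs)
    {
            /* called for each function that passes my_ops' filter hashes */
    }

    static struct ftrace_ops my_ops = {
            .func  = my_func,
            .flags = FTRACE_OPS_FL_RECURSION_SAFE,
    };

    /* 'parent' is a debugfs directory owned by this tracer */
    static void my_setup(struct dentry *parent)
    {
            ftrace_create_filter_files(&my_ops, parent);
            register_ftrace_function(&my_ops);
    }

    static void my_teardown(void)
    {
            unregister_ftrace_function(&my_ops);
            ftrace_destroy_filter_files(&my_ops);   /* marks my_ops FL_DELETED */
    }
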
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a5457d577b98..0434ff1b808e 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -40,8 +40,8 @@ static int write_iteration = 50;
40module_param(write_iteration, uint, 0644); 40module_param(write_iteration, uint, 0644);
41MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); 41MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
42 42
43static int producer_nice = 19; 43static int producer_nice = MAX_NICE;
44static int consumer_nice = 19; 44static int consumer_nice = MAX_NICE;
45 45
46static int producer_fifo = -1; 46static int producer_fifo = -1;
47static int consumer_fifo = -1; 47static int consumer_fifo = -1;
@@ -308,7 +308,7 @@ static void ring_buffer_producer(void)
308 308
309 /* Let the user know that the test is running at low priority */ 309 /* Let the user know that the test is running at low priority */
310 if (producer_fifo < 0 && consumer_fifo < 0 && 310 if (producer_fifo < 0 && consumer_fifo < 0 &&
311 producer_nice == 19 && consumer_nice == 19) 311 producer_nice == MAX_NICE && consumer_nice == MAX_NICE)
312 trace_printk("WARNING!!! This test is running at lowest priority.\n"); 312 trace_printk("WARNING!!! This test is running at lowest priority.\n");
313 313
314 trace_printk("Time: %lld (usecs)\n", time); 314 trace_printk("Time: %lld (usecs)\n", time);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 815c878f409b..9be67c5e5b0f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -73,7 +73,8 @@ static struct tracer_flags dummy_tracer_flags = {
73 .opts = dummy_tracer_opt 73 .opts = dummy_tracer_opt
74}; 74};
75 75
76static int dummy_set_flag(u32 old_flags, u32 bit, int set) 76static int
77dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
77{ 78{
78 return 0; 79 return 0;
79} 80}
@@ -118,7 +119,7 @@ enum ftrace_dump_mode ftrace_dump_on_oops;
118/* When set, tracing will stop when a WARN*() is hit */ 119/* When set, tracing will stop when a WARN*() is hit */
119int __disable_trace_on_warning; 120int __disable_trace_on_warning;
120 121
121static int tracing_set_tracer(const char *buf); 122static int tracing_set_tracer(struct trace_array *tr, const char *buf);
122 123
123#define MAX_TRACER_SIZE 100 124#define MAX_TRACER_SIZE 100
124static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; 125static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
@@ -180,6 +181,17 @@ static int __init set_trace_boot_options(char *str)
180} 181}
181__setup("trace_options=", set_trace_boot_options); 182__setup("trace_options=", set_trace_boot_options);
182 183
184static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata;
185static char *trace_boot_clock __initdata;
186
187static int __init set_trace_boot_clock(char *str)
188{
189 strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
190 trace_boot_clock = trace_boot_clock_buf;
191 return 0;
192}
193__setup("trace_clock=", set_trace_boot_clock);
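
As a usage note for the hunk above: the new trace_clock= boot parameter mirrors the tracing/trace_clock file, so a clock can be picked before user space is up, for example by appending the following to the kernel command line (assuming the named clock, e.g. "global", is one the running kernel offers):

    trace_clock=global
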
194
183 195
184unsigned long long ns2usecs(cycle_t nsec) 196unsigned long long ns2usecs(cycle_t nsec)
185{ 197{
@@ -1230,7 +1242,7 @@ int register_tracer(struct tracer *type)
1230 1242
1231 printk(KERN_INFO "Starting tracer '%s'\n", type->name); 1243 printk(KERN_INFO "Starting tracer '%s'\n", type->name);
1232 /* Do we want this tracer to start on bootup? */ 1244 /* Do we want this tracer to start on bootup? */
1233 tracing_set_tracer(type->name); 1245 tracing_set_tracer(&global_trace, type->name);
1234 default_bootup_tracer = NULL; 1246 default_bootup_tracer = NULL;
1235 /* disable other selftests, since this will break it. */ 1247 /* disable other selftests, since this will break it. */
1236 tracing_selftest_disabled = true; 1248 tracing_selftest_disabled = true;
@@ -1600,15 +1612,31 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer,
1600} 1612}
1601EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); 1613EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
1602 1614
1615static struct ring_buffer *temp_buffer;
1616
1603struct ring_buffer_event * 1617struct ring_buffer_event *
1604trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, 1618trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
1605 struct ftrace_event_file *ftrace_file, 1619 struct ftrace_event_file *ftrace_file,
1606 int type, unsigned long len, 1620 int type, unsigned long len,
1607 unsigned long flags, int pc) 1621 unsigned long flags, int pc)
1608{ 1622{
1623 struct ring_buffer_event *entry;
1624
1609 *current_rb = ftrace_file->tr->trace_buffer.buffer; 1625 *current_rb = ftrace_file->tr->trace_buffer.buffer;
1610 return trace_buffer_lock_reserve(*current_rb, 1626 entry = trace_buffer_lock_reserve(*current_rb,
1611 type, len, flags, pc); 1627 type, len, flags, pc);
1628 /*
1629 * If tracing is off, but we have triggers enabled
1630 * we still need to look at the event data. Use the temp_buffer
1631 * to store the trace event for the trigger to use. It is
1632 * recursion safe and will not be recorded anywhere.
1633 */
1634 if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) {
1635 *current_rb = temp_buffer;
1636 entry = trace_buffer_lock_reserve(*current_rb,
1637 type, len, flags, pc);
1638 }
1639 return entry;
1612} 1640}
1613EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); 1641EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
1614 1642
@@ -3121,27 +3149,52 @@ static int tracing_open(struct inode *inode, struct file *file)
3121 return ret; 3149 return ret;
3122} 3150}
3123 3151
3152/*
3153 * Some tracers are not suitable for instance buffers.
3154 * A tracer is always available for the global array (toplevel)
3155 * or if it explicitly states that it is.
3156 */
3157static bool
3158trace_ok_for_array(struct tracer *t, struct trace_array *tr)
3159{
3160 return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
3161}
3162
3163/* Find the next tracer that this trace array may use */
3164static struct tracer *
3165get_tracer_for_array(struct trace_array *tr, struct tracer *t)
3166{
3167 while (t && !trace_ok_for_array(t, tr))
3168 t = t->next;
3169
3170 return t;
3171}
3172
3124static void * 3173static void *
3125t_next(struct seq_file *m, void *v, loff_t *pos) 3174t_next(struct seq_file *m, void *v, loff_t *pos)
3126{ 3175{
3176 struct trace_array *tr = m->private;
3127 struct tracer *t = v; 3177 struct tracer *t = v;
3128 3178
3129 (*pos)++; 3179 (*pos)++;
3130 3180
3131 if (t) 3181 if (t)
3132 t = t->next; 3182 t = get_tracer_for_array(tr, t->next);
3133 3183
3134 return t; 3184 return t;
3135} 3185}
3136 3186
3137static void *t_start(struct seq_file *m, loff_t *pos) 3187static void *t_start(struct seq_file *m, loff_t *pos)
3138{ 3188{
3189 struct trace_array *tr = m->private;
3139 struct tracer *t; 3190 struct tracer *t;
3140 loff_t l = 0; 3191 loff_t l = 0;
3141 3192
3142 mutex_lock(&trace_types_lock); 3193 mutex_lock(&trace_types_lock);
3143 for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) 3194
3144 ; 3195 t = get_tracer_for_array(tr, trace_types);
3196 for (; t && l < *pos; t = t_next(m, t, &l))
3197 ;
3145 3198
3146 return t; 3199 return t;
3147} 3200}
@@ -3176,10 +3229,21 @@ static const struct seq_operations show_traces_seq_ops = {
3176 3229
3177static int show_traces_open(struct inode *inode, struct file *file) 3230static int show_traces_open(struct inode *inode, struct file *file)
3178{ 3231{
3232 struct trace_array *tr = inode->i_private;
3233 struct seq_file *m;
3234 int ret;
3235
3179 if (tracing_disabled) 3236 if (tracing_disabled)
3180 return -ENODEV; 3237 return -ENODEV;
3181 3238
3182 return seq_open(file, &show_traces_seq_ops); 3239 ret = seq_open(file, &show_traces_seq_ops);
3240 if (ret)
3241 return ret;
3242
3243 m = file->private_data;
3244 m->private = tr;
3245
3246 return 0;
3183} 3247}
3184 3248
3185static ssize_t 3249static ssize_t
@@ -3339,13 +3403,14 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
3339 return 0; 3403 return 0;
3340} 3404}
3341 3405
3342static int __set_tracer_option(struct tracer *trace, 3406static int __set_tracer_option(struct trace_array *tr,
3343 struct tracer_flags *tracer_flags, 3407 struct tracer_flags *tracer_flags,
3344 struct tracer_opt *opts, int neg) 3408 struct tracer_opt *opts, int neg)
3345{ 3409{
3410 struct tracer *trace = tr->current_trace;
3346 int ret; 3411 int ret;
3347 3412
3348 ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); 3413 ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg);
3349 if (ret) 3414 if (ret)
3350 return ret; 3415 return ret;
3351 3416
@@ -3357,8 +3422,9 @@ static int __set_tracer_option(struct tracer *trace,
3357} 3422}
3358 3423
3359/* Try to assign a tracer specific option */ 3424/* Try to assign a tracer specific option */
3360static int set_tracer_option(struct tracer *trace, char *cmp, int neg) 3425static int set_tracer_option(struct trace_array *tr, char *cmp, int neg)
3361{ 3426{
3427 struct tracer *trace = tr->current_trace;
3362 struct tracer_flags *tracer_flags = trace->flags; 3428 struct tracer_flags *tracer_flags = trace->flags;
3363 struct tracer_opt *opts = NULL; 3429 struct tracer_opt *opts = NULL;
3364 int i; 3430 int i;
@@ -3367,8 +3433,7 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
3367 opts = &tracer_flags->opts[i]; 3433 opts = &tracer_flags->opts[i];
3368 3434
3369 if (strcmp(cmp, opts->name) == 0) 3435 if (strcmp(cmp, opts->name) == 0)
3370 return __set_tracer_option(trace, trace->flags, 3436 return __set_tracer_option(tr, trace->flags, opts, neg);
3371 opts, neg);
3372 } 3437 }
3373 3438
3374 return -EINVAL; 3439 return -EINVAL;
@@ -3391,7 +3456,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
3391 3456
3392 /* Give the tracer a chance to approve the change */ 3457 /* Give the tracer a chance to approve the change */
3393 if (tr->current_trace->flag_changed) 3458 if (tr->current_trace->flag_changed)
3394 if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled)) 3459 if (tr->current_trace->flag_changed(tr, mask, !!enabled))
3395 return -EINVAL; 3460 return -EINVAL;
3396 3461
3397 if (enabled) 3462 if (enabled)
@@ -3440,7 +3505,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
3440 3505
3441 /* If no option could be set, test the specific tracer options */ 3506 /* If no option could be set, test the specific tracer options */
3442 if (!trace_options[i]) 3507 if (!trace_options[i])
3443 ret = set_tracer_option(tr->current_trace, cmp, neg); 3508 ret = set_tracer_option(tr, cmp, neg);
3444 3509
3445 mutex_unlock(&trace_types_lock); 3510 mutex_unlock(&trace_types_lock);
3446 3511
@@ -3869,10 +3934,26 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
3869static void 3934static void
3870destroy_trace_option_files(struct trace_option_dentry *topts); 3935destroy_trace_option_files(struct trace_option_dentry *topts);
3871 3936
3872static int tracing_set_tracer(const char *buf) 3937/*
3938 * Used to clear out the tracer before deletion of an instance.
3939 * Must have trace_types_lock held.
3940 */
3941static void tracing_set_nop(struct trace_array *tr)
3942{
3943 if (tr->current_trace == &nop_trace)
3944 return;
3945
3946 tr->current_trace->enabled--;
3947
3948 if (tr->current_trace->reset)
3949 tr->current_trace->reset(tr);
3950
3951 tr->current_trace = &nop_trace;
3952}
3953
3954static int tracing_set_tracer(struct trace_array *tr, const char *buf)
3873{ 3955{
3874 static struct trace_option_dentry *topts; 3956 static struct trace_option_dentry *topts;
3875 struct trace_array *tr = &global_trace;
3876 struct tracer *t; 3957 struct tracer *t;
3877#ifdef CONFIG_TRACER_MAX_TRACE 3958#ifdef CONFIG_TRACER_MAX_TRACE
3878 bool had_max_tr; 3959 bool had_max_tr;
@@ -3900,9 +3981,15 @@ static int tracing_set_tracer(const char *buf)
3900 if (t == tr->current_trace) 3981 if (t == tr->current_trace)
3901 goto out; 3982 goto out;
3902 3983
3984 /* Some tracers are only allowed for the top level buffer */
3985 if (!trace_ok_for_array(t, tr)) {
3986 ret = -EINVAL;
3987 goto out;
3988 }
3989
3903 trace_branch_disable(); 3990 trace_branch_disable();
3904 3991
3905 tr->current_trace->enabled = false; 3992 tr->current_trace->enabled--;
3906 3993
3907 if (tr->current_trace->reset) 3994 if (tr->current_trace->reset)
3908 tr->current_trace->reset(tr); 3995 tr->current_trace->reset(tr);
@@ -3925,9 +4012,11 @@ static int tracing_set_tracer(const char *buf)
3925 free_snapshot(tr); 4012 free_snapshot(tr);
3926 } 4013 }
3927#endif 4014#endif
3928 destroy_trace_option_files(topts); 4015 /* Currently, only the top instance has options */
3929 4016 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
3930 topts = create_trace_option_files(tr, t); 4017 destroy_trace_option_files(topts);
4018 topts = create_trace_option_files(tr, t);
4019 }
3931 4020
3932#ifdef CONFIG_TRACER_MAX_TRACE 4021#ifdef CONFIG_TRACER_MAX_TRACE
3933 if (t->use_max_tr && !had_max_tr) { 4022 if (t->use_max_tr && !had_max_tr) {
@@ -3944,7 +4033,7 @@ static int tracing_set_tracer(const char *buf)
3944 } 4033 }
3945 4034
3946 tr->current_trace = t; 4035 tr->current_trace = t;
3947 tr->current_trace->enabled = true; 4036 tr->current_trace->enabled++;
3948 trace_branch_enable(tr); 4037 trace_branch_enable(tr);
3949 out: 4038 out:
3950 mutex_unlock(&trace_types_lock); 4039 mutex_unlock(&trace_types_lock);
@@ -3956,6 +4045,7 @@ static ssize_t
3956tracing_set_trace_write(struct file *filp, const char __user *ubuf, 4045tracing_set_trace_write(struct file *filp, const char __user *ubuf,
3957 size_t cnt, loff_t *ppos) 4046 size_t cnt, loff_t *ppos)
3958{ 4047{
4048 struct trace_array *tr = filp->private_data;
3959 char buf[MAX_TRACER_SIZE+1]; 4049 char buf[MAX_TRACER_SIZE+1];
3960 int i; 4050 int i;
3961 size_t ret; 4051 size_t ret;
@@ -3975,7 +4065,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
3975 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) 4065 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
3976 buf[i] = 0; 4066 buf[i] = 0;
3977 4067
3978 err = tracing_set_tracer(buf); 4068 err = tracing_set_tracer(tr, buf);
3979 if (err) 4069 if (err)
3980 return err; 4070 return err;
3981 4071
@@ -4683,25 +4773,10 @@ static int tracing_clock_show(struct seq_file *m, void *v)
4683 return 0; 4773 return 0;
4684} 4774}
4685 4775
4686static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, 4776static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
4687 size_t cnt, loff_t *fpos)
4688{ 4777{
4689 struct seq_file *m = filp->private_data;
4690 struct trace_array *tr = m->private;
4691 char buf[64];
4692 const char *clockstr;
4693 int i; 4778 int i;
4694 4779
4695 if (cnt >= sizeof(buf))
4696 return -EINVAL;
4697
4698 if (copy_from_user(&buf, ubuf, cnt))
4699 return -EFAULT;
4700
4701 buf[cnt] = 0;
4702
4703 clockstr = strstrip(buf);
4704
4705 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { 4780 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
4706 if (strcmp(trace_clocks[i].name, clockstr) == 0) 4781 if (strcmp(trace_clocks[i].name, clockstr) == 0)
4707 break; 4782 break;
@@ -4729,6 +4804,32 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4729 4804
4730 mutex_unlock(&trace_types_lock); 4805 mutex_unlock(&trace_types_lock);
4731 4806
4807 return 0;
4808}
4809
4810static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4811 size_t cnt, loff_t *fpos)
4812{
4813 struct seq_file *m = filp->private_data;
4814 struct trace_array *tr = m->private;
4815 char buf[64];
4816 const char *clockstr;
4817 int ret;
4818
4819 if (cnt >= sizeof(buf))
4820 return -EINVAL;
4821
4822 if (copy_from_user(&buf, ubuf, cnt))
4823 return -EFAULT;
4824
4825 buf[cnt] = 0;
4826
4827 clockstr = strstrip(buf);
4828
4829 ret = tracing_set_clock(tr, clockstr);
4830 if (ret)
4831 return ret;
4832
4732 *fpos += cnt; 4833 *fpos += cnt;
4733 4834
4734 return cnt; 4835 return cnt;
@@ -5689,7 +5790,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
5689 5790
5690 if (!!(topt->flags->val & topt->opt->bit) != val) { 5791 if (!!(topt->flags->val & topt->opt->bit) != val) {
5691 mutex_lock(&trace_types_lock); 5792 mutex_lock(&trace_types_lock);
5692 ret = __set_tracer_option(topt->tr->current_trace, topt->flags, 5793 ret = __set_tracer_option(topt->tr, topt->flags,
5693 topt->opt, !val); 5794 topt->opt, !val);
5694 mutex_unlock(&trace_types_lock); 5795 mutex_unlock(&trace_types_lock);
5695 if (ret) 5796 if (ret)
@@ -6096,7 +6197,9 @@ static int instance_delete(const char *name)
6096 6197
6097 list_del(&tr->list); 6198 list_del(&tr->list);
6098 6199
6200 tracing_set_nop(tr);
6099 event_trace_del_tracer(tr); 6201 event_trace_del_tracer(tr);
6202 ftrace_destroy_function_files(tr);
6100 debugfs_remove_recursive(tr->dir); 6203 debugfs_remove_recursive(tr->dir);
6101 free_percpu(tr->trace_buffer.data); 6204 free_percpu(tr->trace_buffer.data);
6102 ring_buffer_free(tr->trace_buffer.buffer); 6205 ring_buffer_free(tr->trace_buffer.buffer);
@@ -6191,6 +6294,12 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6191{ 6294{
6192 int cpu; 6295 int cpu;
6193 6296
6297 trace_create_file("available_tracers", 0444, d_tracer,
6298 tr, &show_traces_fops);
6299
6300 trace_create_file("current_tracer", 0644, d_tracer,
6301 tr, &set_tracer_fops);
6302
6194 trace_create_file("tracing_cpumask", 0644, d_tracer, 6303 trace_create_file("tracing_cpumask", 0644, d_tracer,
6195 tr, &tracing_cpumask_fops); 6304 tr, &tracing_cpumask_fops);
6196 6305
@@ -6221,6 +6330,9 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6221 trace_create_file("tracing_on", 0644, d_tracer, 6330 trace_create_file("tracing_on", 0644, d_tracer,
6222 tr, &rb_simple_fops); 6331 tr, &rb_simple_fops);
6223 6332
6333 if (ftrace_create_function_files(tr, d_tracer))
6334 WARN(1, "Could not allocate function filter files");
6335
6224#ifdef CONFIG_TRACER_SNAPSHOT 6336#ifdef CONFIG_TRACER_SNAPSHOT
6225 trace_create_file("snapshot", 0644, d_tracer, 6337 trace_create_file("snapshot", 0644, d_tracer,
6226 tr, &snapshot_fops); 6338 tr, &snapshot_fops);
@@ -6243,12 +6355,6 @@ static __init int tracer_init_debugfs(void)
6243 6355
6244 init_tracer_debugfs(&global_trace, d_tracer); 6356 init_tracer_debugfs(&global_trace, d_tracer);
6245 6357
6246 trace_create_file("available_tracers", 0444, d_tracer,
6247 &global_trace, &show_traces_fops);
6248
6249 trace_create_file("current_tracer", 0644, d_tracer,
6250 &global_trace, &set_tracer_fops);
6251
6252#ifdef CONFIG_TRACER_MAX_TRACE 6358#ifdef CONFIG_TRACER_MAX_TRACE
6253 trace_create_file("tracing_max_latency", 0644, d_tracer, 6359 trace_create_file("tracing_max_latency", 0644, d_tracer,
6254 &tracing_max_latency, &tracing_max_lat_fops); 6360 &tracing_max_latency, &tracing_max_lat_fops);
@@ -6494,11 +6600,16 @@ __init static int tracer_alloc_buffers(void)
6494 6600
6495 raw_spin_lock_init(&global_trace.start_lock); 6601 raw_spin_lock_init(&global_trace.start_lock);
6496 6602
6603 /* Used for event triggers */
6604 temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
6605 if (!temp_buffer)
6606 goto out_free_cpumask;
6607
6497 /* TODO: make the number of buffers hot pluggable with CPUS */ 6608 /* TODO: make the number of buffers hot pluggable with CPUS */
6498 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { 6609 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
6499 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 6610 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
6500 WARN_ON(1); 6611 WARN_ON(1);
6501 goto out_free_cpumask; 6612 goto out_free_temp_buffer;
6502 } 6613 }
6503 6614
6504 if (global_trace.buffer_disabled) 6615 if (global_trace.buffer_disabled)
@@ -6506,6 +6617,13 @@ __init static int tracer_alloc_buffers(void)
6506 6617
6507 trace_init_cmdlines(); 6618 trace_init_cmdlines();
6508 6619
6620 if (trace_boot_clock) {
6621 ret = tracing_set_clock(&global_trace, trace_boot_clock);
6622 if (ret < 0)
6623 pr_warning("Trace clock %s not defined, going back to default\n",
6624 trace_boot_clock);
6625 }
6626
6509 /* 6627 /*
6510 * register_tracer() might reference current_trace, so it 6628 * register_tracer() might reference current_trace, so it
6511 * needs to be set before we register anything. This is 6629 * needs to be set before we register anything. This is
@@ -6540,6 +6658,8 @@ __init static int tracer_alloc_buffers(void)
6540 6658
6541 return 0; 6659 return 0;
6542 6660
6661out_free_temp_buffer:
6662 ring_buffer_free(temp_buffer);
6543out_free_cpumask: 6663out_free_cpumask:
6544 free_percpu(global_trace.trace_buffer.data); 6664 free_percpu(global_trace.trace_buffer.data);
6545#ifdef CONFIG_TRACER_MAX_TRACE 6665#ifdef CONFIG_TRACER_MAX_TRACE
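The new out_free_temp_buffer label slots into the usual reverse-order goto unwind in tracer_alloc_buffers(): each later allocation gets a label placed above the earlier ones, so a failure releases only what was already acquired. A self-contained illustration of that error-path ordering (the resources here are just malloc placeholders):

#include <stdio.h>
#include <stdlib.h>

static int setup(void)
{
        void *cpumask, *temp_buffer, *main_buffer;

        cpumask = malloc(64);
        if (!cpumask)
                goto out;

        temp_buffer = malloc(4096);          /* like the trigger temp buffer */
        if (!temp_buffer)
                goto out_free_cpumask;

        main_buffer = malloc(1 << 20);       /* like the main ring buffer */
        if (!main_buffer)
                goto out_free_temp_buffer;   /* newest label unwinds first */

        printf("all buffers allocated\n");
        free(main_buffer);
        free(temp_buffer);
        free(cpumask);
        return 0;

out_free_temp_buffer:
        free(temp_buffer);
out_free_cpumask:
        free(cpumask);
out:
        return -1;
}

int main(void)
{
        return setup() ? 1 : 0;
}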
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 02b592f2d4b7..ffc314b7e92b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -210,6 +210,11 @@ struct trace_array {
210 struct list_head events; 210 struct list_head events;
211 cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ 211 cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
212 int ref; 212 int ref;
213#ifdef CONFIG_FUNCTION_TRACER
214 struct ftrace_ops *ops;
215 /* function tracing enabled */
216 int function_enabled;
217#endif
213}; 218};
214 219
215enum { 220enum {
@@ -355,14 +360,16 @@ struct tracer {
355 void (*print_header)(struct seq_file *m); 360 void (*print_header)(struct seq_file *m);
356 enum print_line_t (*print_line)(struct trace_iterator *iter); 361 enum print_line_t (*print_line)(struct trace_iterator *iter);
357 /* If you handled the flag setting, return 0 */ 362 /* If you handled the flag setting, return 0 */
358 int (*set_flag)(u32 old_flags, u32 bit, int set); 363 int (*set_flag)(struct trace_array *tr,
364 u32 old_flags, u32 bit, int set);
359 /* Return 0 if OK with change, else return non-zero */ 365 /* Return 0 if OK with change, else return non-zero */
360 int (*flag_changed)(struct tracer *tracer, 366 int (*flag_changed)(struct trace_array *tr,
361 u32 mask, int set); 367 u32 mask, int set);
362 struct tracer *next; 368 struct tracer *next;
363 struct tracer_flags *flags; 369 struct tracer_flags *flags;
370 int enabled;
364 bool print_max; 371 bool print_max;
365 bool enabled; 372 bool allow_instances;
366#ifdef CONFIG_TRACER_MAX_TRACE 373#ifdef CONFIG_TRACER_MAX_TRACE
367 bool use_max_tr; 374 bool use_max_tr;
368#endif 375#endif
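Because a tracer can now be current in more than one trace array, the hunk above turns tracer->enabled from a bool into a count that tracing_set_tracer() and tracing_set_nop() increment and decrement. A minimal sketch of the counting idea; the start/stop messages here are illustrative only, the kernel code just bumps the counter:

#include <assert.h>
#include <stdio.h>

struct shared_tracer {
        const char *name;
        int enabled;            /* was: bool */
};

static void tracer_get(struct shared_tracer *t)
{
        if (t->enabled++ == 0)
                printf("%s: first user\n", t->name);
}

static void tracer_put(struct shared_tracer *t)
{
        assert(t->enabled > 0);
        if (--t->enabled == 0)
                printf("%s: last user gone\n", t->name);
}

int main(void)
{
        struct shared_tracer nop = { "nop", 0 };

        tracer_get(&nop);       /* top-level buffer selects it */
        tracer_get(&nop);       /* an instance selects it too */
        tracer_put(&nop);       /* still in use by the other array */
        tracer_put(&nop);
        return 0;
}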
@@ -812,13 +819,36 @@ static inline int ftrace_trace_task(struct task_struct *task)
812 return test_tsk_trace_trace(task); 819 return test_tsk_trace_trace(task);
813} 820}
814extern int ftrace_is_dead(void); 821extern int ftrace_is_dead(void);
822int ftrace_create_function_files(struct trace_array *tr,
823 struct dentry *parent);
824void ftrace_destroy_function_files(struct trace_array *tr);
815#else 825#else
816static inline int ftrace_trace_task(struct task_struct *task) 826static inline int ftrace_trace_task(struct task_struct *task)
817{ 827{
818 return 1; 828 return 1;
819} 829}
820static inline int ftrace_is_dead(void) { return 0; } 830static inline int ftrace_is_dead(void) { return 0; }
821#endif 831static inline int
832ftrace_create_function_files(struct trace_array *tr,
833 struct dentry *parent)
834{
835 return 0;
836}
837static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
838#endif /* CONFIG_FUNCTION_TRACER */
839
840#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
841void ftrace_create_filter_files(struct ftrace_ops *ops,
842 struct dentry *parent);
843void ftrace_destroy_filter_files(struct ftrace_ops *ops);
844#else
845/*
846 * The ops parameter passed in is usually undefined.
847 * This must be a macro.
848 */
849#define ftrace_create_filter_files(ops, parent) do { } while (0)
850#define ftrace_destroy_filter_files(ops) do { } while (0)
851#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */
822 852
823int ftrace_event_is_function(struct ftrace_event_call *call); 853int ftrace_event_is_function(struct ftrace_event_call *call);
824 854
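The comment in the hunk above explains why the !DYNAMIC_FTRACE stubs for ftrace_create_filter_files()/ftrace_destroy_filter_files() must be macros: the ops argument at the call site may name a struct member that only exists in some configurations, and a static inline would still type-check it. A small user-space illustration of the difference; every name below is invented:

#include <stdio.h>

#define HAVE_FANCY 0

struct ctx {
        int id;
#if HAVE_FANCY
        struct fancy_ops *ops;   /* member only present in some builds */
#endif
};

#if HAVE_FANCY
void create_files(struct fancy_ops *ops);
#define setup_files(ops) create_files(ops)
#else
/*
 * Macro stub: the "c.ops" tokens below are discarded by the
 * preprocessor, so they never have to be a valid expression.
 * A static inline stub would not compile here, because the
 * argument would still be evaluated and type-checked.
 */
#define setup_files(ops) do { } while (0)
#endif

int main(void)
{
        struct ctx c = { .id = 1 };

        setup_files(c.ops);      /* builds even though 'ops' does not exist */
        printf("ctx %d initialised\n", c.id);
        return 0;
}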
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index e854f420e033..c894614de14d 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -31,9 +31,25 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
31 } 31 }
32 32
33 /* The ftrace function trace is allowed only for root. */ 33 /* The ftrace function trace is allowed only for root. */
34 if (ftrace_event_is_function(tp_event) && 34 if (ftrace_event_is_function(tp_event)) {
35 perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) 35 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
36 return -EPERM; 36 return -EPERM;
37
38 /*
39 * We don't allow user space callchains for function trace
 40	 * event, due to issues with page faults while tracing the page
 41	 * fault handler, and its overall tricky nature.
42 */
43 if (!p_event->attr.exclude_callchain_user)
44 return -EINVAL;
45
46 /*
47 * Same reason to disable user stack dump as for user space
48 * callchains above.
49 */
50 if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
51 return -EINVAL;
52 }
37 53
38 /* No tracing, just counting, so no obvious leak */ 54 /* No tracing, just counting, so no obvious leak */
39 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) 55 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
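After the check added above, a perf_event_open() user attaching to the ftrace function trace event must opt out of user-space callchains and must not request PERF_SAMPLE_STACK_USER, or the open fails with -EINVAL. A hedged user-space sketch of setting those attr bits; the event id is a placeholder for the value read from .../events/ftrace/function/id:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
        struct perf_event_attr attr;
        long fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_TRACEPOINT;
        attr.config = 1;                 /* placeholder: function event id */
        attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN;
        /* Required by the new check: kernel-only callchains ... */
        attr.exclude_callchain_user = 1;
        /* ... and no PERF_SAMPLE_STACK_USER in sample_type. */

        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0)
                perror("perf_event_open");
        else
                close(fd);
        return 0;
}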
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f3989ceb5cd5..83a4378dc5e0 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,12 +27,6 @@
27 27
28DEFINE_MUTEX(event_mutex); 28DEFINE_MUTEX(event_mutex);
29 29
30DEFINE_MUTEX(event_storage_mutex);
31EXPORT_SYMBOL_GPL(event_storage_mutex);
32
33char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage);
35
36LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
37static LIST_HEAD(ftrace_common_fields); 31static LIST_HEAD(ftrace_common_fields);
38 32
@@ -194,6 +188,36 @@ int trace_event_raw_init(struct ftrace_event_call *call)
194} 188}
195EXPORT_SYMBOL_GPL(trace_event_raw_init); 189EXPORT_SYMBOL_GPL(trace_event_raw_init);
196 190
191void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
192 struct ftrace_event_file *ftrace_file,
193 unsigned long len)
194{
195 struct ftrace_event_call *event_call = ftrace_file->event_call;
196
197 local_save_flags(fbuffer->flags);
198 fbuffer->pc = preempt_count();
199 fbuffer->ftrace_file = ftrace_file;
200
201 fbuffer->event =
202 trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file,
203 event_call->event.type, len,
204 fbuffer->flags, fbuffer->pc);
205 if (!fbuffer->event)
206 return NULL;
207
208 fbuffer->entry = ring_buffer_event_data(fbuffer->event);
209 return fbuffer->entry;
210}
211EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
212
213void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
214{
215 event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
216 fbuffer->event, fbuffer->entry,
217 fbuffer->flags, fbuffer->pc);
218}
219EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit);
220
197int ftrace_event_reg(struct ftrace_event_call *call, 221int ftrace_event_reg(struct ftrace_event_call *call,
198 enum trace_reg type, void *data) 222 enum trace_reg type, void *data)
199{ 223{
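ftrace_event_buffer_reserve() and ftrace_event_buffer_commit() above wrap the reserve/fill/commit cycle that generated trace-event probes go through. A kernel-side sketch (not buildable on its own) of how a caller might use the pair; the my_entry layout and value field are invented, real callers are generated from the event's TP_STRUCT__entry:

#include <linux/ftrace_event.h>

struct my_entry {
        struct trace_entry ent;
        unsigned long value;
};

static void my_probe(struct ftrace_event_file *ftrace_file, unsigned long value)
{
        struct ftrace_event_buffer fbuffer;
        struct my_entry *entry;

        entry = ftrace_event_buffer_reserve(&fbuffer, ftrace_file,
                                            sizeof(*entry));
        if (!entry)
                return;         /* nothing reserved; drop the event */

        entry->value = value;   /* fill in the event payload */

        ftrace_event_buffer_commit(&fbuffer);
}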
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 7c3e3e72e2b6..ee0a5098ac43 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \
95#undef __array 95#undef __array
96#define __array(type, item, len) \ 96#define __array(type, item, len) \
97 do { \ 97 do { \
98 char *type_str = #type"["__stringify(len)"]"; \
98 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 99 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
99 mutex_lock(&event_storage_mutex); \ 100 ret = trace_define_field(event_call, type_str, #item, \
100 snprintf(event_storage, sizeof(event_storage), \
101 "%s[%d]", #type, len); \
102 ret = trace_define_field(event_call, event_storage, #item, \
103 offsetof(typeof(field), item), \ 101 offsetof(typeof(field), item), \
104 sizeof(field.item), \ 102 sizeof(field.item), \
105 is_signed_type(type), filter_type); \ 103 is_signed_type(type), filter_type); \
106 mutex_unlock(&event_storage_mutex); \
107 if (ret) \ 104 if (ret) \
108 return ret; \ 105 return ret; \
109 } while (0); 106 } while (0);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 38fe1483c508..5b781d2be383 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,32 +13,106 @@
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/slab.h>
16#include <linux/fs.h> 17#include <linux/fs.h>
17 18
18#include "trace.h" 19#include "trace.h"
19 20
20/* function tracing enabled */ 21static void tracing_start_function_trace(struct trace_array *tr);
21static int ftrace_function_enabled; 22static void tracing_stop_function_trace(struct trace_array *tr);
23static void
24function_trace_call(unsigned long ip, unsigned long parent_ip,
25 struct ftrace_ops *op, struct pt_regs *pt_regs);
26static void
27function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
28 struct ftrace_ops *op, struct pt_regs *pt_regs);
29static struct ftrace_ops trace_ops;
30static struct ftrace_ops trace_stack_ops;
31static struct tracer_flags func_flags;
32
33/* Our option */
34enum {
35 TRACE_FUNC_OPT_STACK = 0x1,
36};
37
38static int allocate_ftrace_ops(struct trace_array *tr)
39{
40 struct ftrace_ops *ops;
41
42 ops = kzalloc(sizeof(*ops), GFP_KERNEL);
43 if (!ops)
44 return -ENOMEM;
22 45
 23static struct trace_array *func_trace; 46	/* Currently only the non-stack version is supported */
47 ops->func = function_trace_call;
48 ops->flags = FTRACE_OPS_FL_RECURSION_SAFE;
49
50 tr->ops = ops;
51 ops->private = tr;
52 return 0;
53}
54
55
56int ftrace_create_function_files(struct trace_array *tr,
57 struct dentry *parent)
58{
59 int ret;
60
61 /* The top level array uses the "global_ops". */
62 if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) {
63 ret = allocate_ftrace_ops(tr);
64 if (ret)
65 return ret;
66 }
67
68 ftrace_create_filter_files(tr->ops, parent);
69
70 return 0;
71}
24 72
25static void tracing_start_function_trace(void); 73void ftrace_destroy_function_files(struct trace_array *tr)
26static void tracing_stop_function_trace(void); 74{
75 ftrace_destroy_filter_files(tr->ops);
76 kfree(tr->ops);
77 tr->ops = NULL;
78}
27 79
28static int function_trace_init(struct trace_array *tr) 80static int function_trace_init(struct trace_array *tr)
29{ 81{
30 func_trace = tr; 82 struct ftrace_ops *ops;
83
84 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
85 /* There's only one global tr */
86 if (!trace_ops.private) {
87 trace_ops.private = tr;
88 trace_stack_ops.private = tr;
89 }
90
91 if (func_flags.val & TRACE_FUNC_OPT_STACK)
92 ops = &trace_stack_ops;
93 else
94 ops = &trace_ops;
95 tr->ops = ops;
96 } else if (!tr->ops) {
97 /*
98 * Instance trace_arrays get their ops allocated
 99		 * at instance creation, unless that
 100		 * allocation failed.
101 */
102 return -ENOMEM;
103 }
104
31 tr->trace_buffer.cpu = get_cpu(); 105 tr->trace_buffer.cpu = get_cpu();
32 put_cpu(); 106 put_cpu();
33 107
34 tracing_start_cmdline_record(); 108 tracing_start_cmdline_record();
35 tracing_start_function_trace(); 109 tracing_start_function_trace(tr);
36 return 0; 110 return 0;
37} 111}
38 112
39static void function_trace_reset(struct trace_array *tr) 113static void function_trace_reset(struct trace_array *tr)
40{ 114{
41 tracing_stop_function_trace(); 115 tracing_stop_function_trace(tr);
42 tracing_stop_cmdline_record(); 116 tracing_stop_cmdline_record();
43} 117}
44 118
@@ -47,25 +121,18 @@ static void function_trace_start(struct trace_array *tr)
47 tracing_reset_online_cpus(&tr->trace_buffer); 121 tracing_reset_online_cpus(&tr->trace_buffer);
48} 122}
49 123
50/* Our option */
51enum {
52 TRACE_FUNC_OPT_STACK = 0x1,
53};
54
55static struct tracer_flags func_flags;
56
57static void 124static void
58function_trace_call(unsigned long ip, unsigned long parent_ip, 125function_trace_call(unsigned long ip, unsigned long parent_ip,
59 struct ftrace_ops *op, struct pt_regs *pt_regs) 126 struct ftrace_ops *op, struct pt_regs *pt_regs)
60{ 127{
61 struct trace_array *tr = func_trace; 128 struct trace_array *tr = op->private;
62 struct trace_array_cpu *data; 129 struct trace_array_cpu *data;
63 unsigned long flags; 130 unsigned long flags;
64 int bit; 131 int bit;
65 int cpu; 132 int cpu;
66 int pc; 133 int pc;
67 134
68 if (unlikely(!ftrace_function_enabled)) 135 if (unlikely(!tr->function_enabled))
69 return; 136 return;
70 137
71 pc = preempt_count(); 138 pc = preempt_count();
@@ -91,14 +158,14 @@ static void
91function_stack_trace_call(unsigned long ip, unsigned long parent_ip, 158function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
92 struct ftrace_ops *op, struct pt_regs *pt_regs) 159 struct ftrace_ops *op, struct pt_regs *pt_regs)
93{ 160{
94 struct trace_array *tr = func_trace; 161 struct trace_array *tr = op->private;
95 struct trace_array_cpu *data; 162 struct trace_array_cpu *data;
96 unsigned long flags; 163 unsigned long flags;
97 long disabled; 164 long disabled;
98 int cpu; 165 int cpu;
99 int pc; 166 int pc;
100 167
101 if (unlikely(!ftrace_function_enabled)) 168 if (unlikely(!tr->function_enabled))
102 return; 169 return;
103 170
104 /* 171 /*
@@ -128,7 +195,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
128 local_irq_restore(flags); 195 local_irq_restore(flags);
129} 196}
130 197
131
132static struct ftrace_ops trace_ops __read_mostly = 198static struct ftrace_ops trace_ops __read_mostly =
133{ 199{
134 .func = function_trace_call, 200 .func = function_trace_call,
@@ -153,29 +219,21 @@ static struct tracer_flags func_flags = {
153 .opts = func_opts 219 .opts = func_opts
154}; 220};
155 221
156static void tracing_start_function_trace(void) 222static void tracing_start_function_trace(struct trace_array *tr)
157{ 223{
158 ftrace_function_enabled = 0; 224 tr->function_enabled = 0;
159 225 register_ftrace_function(tr->ops);
160 if (func_flags.val & TRACE_FUNC_OPT_STACK) 226 tr->function_enabled = 1;
161 register_ftrace_function(&trace_stack_ops);
162 else
163 register_ftrace_function(&trace_ops);
164
165 ftrace_function_enabled = 1;
166} 227}
167 228
168static void tracing_stop_function_trace(void) 229static void tracing_stop_function_trace(struct trace_array *tr)
169{ 230{
170 ftrace_function_enabled = 0; 231 tr->function_enabled = 0;
171 232 unregister_ftrace_function(tr->ops);
172 if (func_flags.val & TRACE_FUNC_OPT_STACK)
173 unregister_ftrace_function(&trace_stack_ops);
174 else
175 unregister_ftrace_function(&trace_ops);
176} 233}
177 234
178static int func_set_flag(u32 old_flags, u32 bit, int set) 235static int
236func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
179{ 237{
180 switch (bit) { 238 switch (bit) {
181 case TRACE_FUNC_OPT_STACK: 239 case TRACE_FUNC_OPT_STACK:
@@ -183,12 +241,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
183 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) 241 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
184 break; 242 break;
185 243
244 unregister_ftrace_function(tr->ops);
245
186 if (set) { 246 if (set) {
187 unregister_ftrace_function(&trace_ops); 247 tr->ops = &trace_stack_ops;
188 register_ftrace_function(&trace_stack_ops); 248 register_ftrace_function(tr->ops);
189 } else { 249 } else {
190 unregister_ftrace_function(&trace_stack_ops); 250 tr->ops = &trace_ops;
191 register_ftrace_function(&trace_ops); 251 register_ftrace_function(tr->ops);
192 } 252 }
193 253
194 break; 254 break;
@@ -208,6 +268,7 @@ static struct tracer function_trace __tracer_data =
208 .wait_pipe = poll_wait_pipe, 268 .wait_pipe = poll_wait_pipe,
209 .flags = &func_flags, 269 .flags = &func_flags,
210 .set_flag = func_set_flag, 270 .set_flag = func_set_flag,
271 .allow_instances = true,
211#ifdef CONFIG_FTRACE_SELFTEST 272#ifdef CONFIG_FTRACE_SELFTEST
212 .selftest = trace_selftest_startup_function, 273 .selftest = trace_selftest_startup_function,
213#endif 274#endif
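With the trace_functions.c changes above, the function tracer callbacks no longer read one global trace_array; each registered ftrace_ops carries its owner in ->private and checks that instance's function_enabled flag. A stand-alone sketch of that callback-context pattern, with invented types:

#include <stdio.h>

struct instance {
        const char *name;
        int function_enabled;
};

struct ops {
        void (*func)(struct ops *op, unsigned long ip);
        void *private;                  /* owning instance */
};

static void trace_call(struct ops *op, unsigned long ip)
{
        struct instance *tr = op->private;

        if (!tr->function_enabled)
                return;
        printf("[%s] hit %#lx\n", tr->name, ip);
}

int main(void)
{
        struct instance top = { "top", 1 }, foo = { "foo", 1 };
        struct ops top_ops = { trace_call, &top };
        struct ops foo_ops = { trace_call, &foo };

        top_ops.func(&top_ops, 0xc0de);
        foo.function_enabled = 0;       /* one instance disabled independently */
        foo_ops.func(&foo_ops, 0xbeef);
        return 0;
}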
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 0b99120d395c..deff11200261 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1476,7 +1476,8 @@ void graph_trace_close(struct trace_iterator *iter)
1476 } 1476 }
1477} 1477}
1478 1478
1479static int func_graph_set_flag(u32 old_flags, u32 bit, int set) 1479static int
1480func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
1480{ 1481{
1481 if (bit == TRACE_GRAPH_PRINT_IRQS) 1482 if (bit == TRACE_GRAPH_PRINT_IRQS)
1482 ftrace_graph_skip_irqs = !set; 1483 ftrace_graph_skip_irqs = !set;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2aefbee93a6d..8ff02cbb892f 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -160,7 +160,8 @@ static struct ftrace_ops trace_ops __read_mostly =
160#endif /* CONFIG_FUNCTION_TRACER */ 160#endif /* CONFIG_FUNCTION_TRACER */
161 161
162#ifdef CONFIG_FUNCTION_GRAPH_TRACER 162#ifdef CONFIG_FUNCTION_GRAPH_TRACER
163static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) 163static int
164irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
164{ 165{
165 int cpu; 166 int cpu;
166 167
@@ -266,7 +267,8 @@ __trace_function(struct trace_array *tr,
266#else 267#else
267#define __trace_function trace_function 268#define __trace_function trace_function
268 269
269static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) 270static int
271irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
270{ 272{
271 return -EINVAL; 273 return -EINVAL;
272} 274}
@@ -498,14 +500,14 @@ void trace_hardirqs_off(void)
498} 500}
499EXPORT_SYMBOL(trace_hardirqs_off); 501EXPORT_SYMBOL(trace_hardirqs_off);
500 502
501void trace_hardirqs_on_caller(unsigned long caller_addr) 503__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
502{ 504{
503 if (!preempt_trace() && irq_trace()) 505 if (!preempt_trace() && irq_trace())
504 stop_critical_timing(CALLER_ADDR0, caller_addr); 506 stop_critical_timing(CALLER_ADDR0, caller_addr);
505} 507}
506EXPORT_SYMBOL(trace_hardirqs_on_caller); 508EXPORT_SYMBOL(trace_hardirqs_on_caller);
507 509
508void trace_hardirqs_off_caller(unsigned long caller_addr) 510__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
509{ 511{
510 if (!preempt_trace() && irq_trace()) 512 if (!preempt_trace() && irq_trace())
511 start_critical_timing(CALLER_ADDR0, caller_addr); 513 start_critical_timing(CALLER_ADDR0, caller_addr);
@@ -570,8 +572,10 @@ static void irqsoff_function_set(int set)
570 unregister_irqsoff_function(is_graph()); 572 unregister_irqsoff_function(is_graph());
571} 573}
572 574
573static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set) 575static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
574{ 576{
577 struct tracer *tracer = tr->current_trace;
578
575 if (mask & TRACE_ITER_FUNCTION) 579 if (mask & TRACE_ITER_FUNCTION)
576 irqsoff_function_set(set); 580 irqsoff_function_set(set);
577 581
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index bdbae450c13e..d021d21dd150 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -35,11 +35,6 @@ struct trace_kprobe {
35 struct trace_probe tp; 35 struct trace_probe tp;
36}; 36};
37 37
38struct event_file_link {
39 struct ftrace_event_file *file;
40 struct list_head list;
41};
42
43#define SIZEOF_TRACE_KPROBE(n) \ 38#define SIZEOF_TRACE_KPROBE(n) \
44 (offsetof(struct trace_kprobe, tp.args) + \ 39 (offsetof(struct trace_kprobe, tp.args) + \
45 (sizeof(struct probe_arg) * (n))) 40 (sizeof(struct probe_arg) * (n)))
@@ -387,18 +382,6 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
387 return ret; 382 return ret;
388} 383}
389 384
390static struct event_file_link *
391find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
392{
393 struct event_file_link *link;
394
395 list_for_each_entry(link, &tp->files, list)
396 if (link->file == file)
397 return link;
398
399 return NULL;
400}
401
402/* 385/*
403 * Disable trace_probe 386 * Disable trace_probe
404 * if the file is NULL, disable "perf" handler, or disable "trace" handler. 387 * if the file is NULL, disable "perf" handler, or disable "trace" handler.
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 394f94417e2f..69a5cc94c01a 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -62,7 +62,7 @@ static void nop_trace_reset(struct trace_array *tr)
62 * If you don't implement it, then the flag setting will be 62 * If you don't implement it, then the flag setting will be
63 * automatically accepted. 63 * automatically accepted.
64 */ 64 */
65static int nop_set_flag(u32 old_flags, u32 bit, int set) 65static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
66{ 66{
67 /* 67 /*
68 * Note that you don't need to update nop_flags.val yourself. 68 * Note that you don't need to update nop_flags.val yourself.
@@ -96,6 +96,7 @@ struct tracer nop_trace __read_mostly =
96 .selftest = trace_selftest_startup_nop, 96 .selftest = trace_selftest_startup_nop,
97#endif 97#endif
98 .flags = &nop_flags, 98 .flags = &nop_flags,
99 .set_flag = nop_set_flag 99 .set_flag = nop_set_flag,
100 .allow_instances = true,
100}; 101};
101 102
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ed32284fbe32..ca0e79e2abaa 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -439,6 +439,37 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
439} 439}
440EXPORT_SYMBOL(ftrace_raw_output_prep); 440EXPORT_SYMBOL(ftrace_raw_output_prep);
441 441
442static int ftrace_output_raw(struct trace_iterator *iter, char *name,
443 char *fmt, va_list ap)
444{
445 struct trace_seq *s = &iter->seq;
446 int ret;
447
448 ret = trace_seq_printf(s, "%s: ", name);
449 if (!ret)
450 return TRACE_TYPE_PARTIAL_LINE;
451
452 ret = trace_seq_vprintf(s, fmt, ap);
453
454 if (!ret)
455 return TRACE_TYPE_PARTIAL_LINE;
456
457 return TRACE_TYPE_HANDLED;
458}
459
460int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
461{
462 va_list ap;
463 int ret;
464
465 va_start(ap, fmt);
466 ret = ftrace_output_raw(iter, name, fmt, ap);
467 va_end(ap);
468
469 return ret;
470}
471EXPORT_SYMBOL_GPL(ftrace_output_call);
472
442#ifdef CONFIG_KRETPROBES 473#ifdef CONFIG_KRETPROBES
443static inline const char *kretprobed(const char *name) 474static inline const char *kretprobed(const char *name)
444{ 475{
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index b73574a5f429..fb1ab5dfbd42 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -288,6 +288,11 @@ struct trace_probe {
288 struct probe_arg args[]; 288 struct probe_arg args[];
289}; 289};
290 290
291struct event_file_link {
292 struct ftrace_event_file *file;
293 struct list_head list;
294};
295
291static inline bool trace_probe_is_enabled(struct trace_probe *tp) 296static inline bool trace_probe_is_enabled(struct trace_probe *tp)
292{ 297{
293 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); 298 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
@@ -316,6 +321,18 @@ static inline int is_good_name(const char *name)
316 return 1; 321 return 1;
317} 322}
318 323
324static inline struct event_file_link *
325find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
326{
327 struct event_file_link *link;
328
329 list_for_each_entry(link, &tp->files, list)
330 if (link->file == file)
331 return link;
332
333 return NULL;
334}
335
319extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, 336extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size,
320 struct probe_arg *parg, bool is_return, bool is_kprobe); 337 struct probe_arg *parg, bool is_return, bool is_kprobe);
321 338
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 6e32635e5e57..e14da5e97a69 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -179,8 +179,10 @@ static void wakeup_function_set(int set)
179 unregister_wakeup_function(is_graph()); 179 unregister_wakeup_function(is_graph());
180} 180}
181 181
182static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set) 182static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
183{ 183{
184 struct tracer *tracer = tr->current_trace;
185
184 if (mask & TRACE_ITER_FUNCTION) 186 if (mask & TRACE_ITER_FUNCTION)
185 wakeup_function_set(set); 187 wakeup_function_set(set);
186 188
@@ -209,7 +211,8 @@ static void stop_func_tracer(int graph)
209} 211}
210 212
211#ifdef CONFIG_FUNCTION_GRAPH_TRACER 213#ifdef CONFIG_FUNCTION_GRAPH_TRACER
212static int wakeup_set_flag(u32 old_flags, u32 bit, int set) 214static int
215wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
213{ 216{
214 217
215 if (!(bit & TRACE_DISPLAY_GRAPH)) 218 if (!(bit & TRACE_DISPLAY_GRAPH))
@@ -311,7 +314,8 @@ __trace_function(struct trace_array *tr,
311#else 314#else
312#define __trace_function trace_function 315#define __trace_function trace_function
313 316
314static int wakeup_set_flag(u32 old_flags, u32 bit, int set) 317static int
318wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
315{ 319{
316 return -EINVAL; 320 return -EINVAL;
317} 321}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index e6be585cf06a..21b320e5d163 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,6 +13,7 @@
13#include <linux/sysctl.h> 13#include <linux/sysctl.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/magic.h>
16 17
17#include <asm/setup.h> 18#include <asm/setup.h>
18 19
@@ -144,6 +145,8 @@ check_stack(unsigned long ip, unsigned long *stack)
144 i++; 145 i++;
145 } 146 }
146 147
148 BUG_ON(current != &init_task &&
149 *(end_of_stack(current)) != STACK_END_MAGIC);
147 out: 150 out:
148 arch_spin_unlock(&max_stack_lock); 151 arch_spin_unlock(&max_stack_lock);
149 local_irq_restore(flags); 152 local_irq_restore(flags);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 79e52d93860b..e4473367e7a4 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -260,6 +260,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
260 goto error; 260 goto error;
261 261
262 INIT_LIST_HEAD(&tu->list); 262 INIT_LIST_HEAD(&tu->list);
263 INIT_LIST_HEAD(&tu->tp.files);
263 tu->consumer.handler = uprobe_dispatcher; 264 tu->consumer.handler = uprobe_dispatcher;
264 if (is_ret) 265 if (is_ret)
265 tu->consumer.ret_handler = uretprobe_dispatcher; 266 tu->consumer.ret_handler = uretprobe_dispatcher;
@@ -758,31 +759,32 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
758 mutex_unlock(&ucb->mutex); 759 mutex_unlock(&ucb->mutex);
759} 760}
760 761
761static void uprobe_trace_print(struct trace_uprobe *tu, 762static void __uprobe_trace_func(struct trace_uprobe *tu,
762 unsigned long func, struct pt_regs *regs) 763 unsigned long func, struct pt_regs *regs,
764 struct uprobe_cpu_buffer *ucb, int dsize,
765 struct ftrace_event_file *ftrace_file)
763{ 766{
764 struct uprobe_trace_entry_head *entry; 767 struct uprobe_trace_entry_head *entry;
765 struct ring_buffer_event *event; 768 struct ring_buffer_event *event;
766 struct ring_buffer *buffer; 769 struct ring_buffer *buffer;
767 struct uprobe_cpu_buffer *ucb;
768 void *data; 770 void *data;
769 int size, dsize, esize; 771 int size, esize;
770 struct ftrace_event_call *call = &tu->tp.call; 772 struct ftrace_event_call *call = &tu->tp.call;
771 773
772 dsize = __get_data_size(&tu->tp, regs); 774 WARN_ON(call != ftrace_file->event_call);
773 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
774 775
775 if (WARN_ON_ONCE(!uprobe_cpu_buffer || tu->tp.size + dsize > PAGE_SIZE)) 776 if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
776 return; 777 return;
777 778
778 ucb = uprobe_buffer_get(); 779 if (ftrace_trigger_soft_disabled(ftrace_file))
779 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); 780 return;
780 781
782 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
781 size = esize + tu->tp.size + dsize; 783 size = esize + tu->tp.size + dsize;
782 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 784 event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
783 size, 0, 0); 785 call->event.type, size, 0, 0);
784 if (!event) 786 if (!event)
785 goto out; 787 return;
786 788
787 entry = ring_buffer_event_data(event); 789 entry = ring_buffer_event_data(event);
788 if (is_ret_probe(tu)) { 790 if (is_ret_probe(tu)) {
@@ -796,25 +798,36 @@ static void uprobe_trace_print(struct trace_uprobe *tu,
796 798
797 memcpy(data, ucb->buf, tu->tp.size + dsize); 799 memcpy(data, ucb->buf, tu->tp.size + dsize);
798 800
799 if (!call_filter_check_discard(call, entry, buffer, event)) 801 event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0);
800 trace_buffer_unlock_commit(buffer, event, 0, 0);
801
802out:
803 uprobe_buffer_put(ucb);
804} 802}
805 803
806/* uprobe handler */ 804/* uprobe handler */
807static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) 805static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
806 struct uprobe_cpu_buffer *ucb, int dsize)
808{ 807{
809 if (!is_ret_probe(tu)) 808 struct event_file_link *link;
810 uprobe_trace_print(tu, 0, regs); 809
810 if (is_ret_probe(tu))
811 return 0;
812
813 rcu_read_lock();
814 list_for_each_entry_rcu(link, &tu->tp.files, list)
815 __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file);
816 rcu_read_unlock();
817
811 return 0; 818 return 0;
812} 819}
813 820
814static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, 821static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
815 struct pt_regs *regs) 822 struct pt_regs *regs,
823 struct uprobe_cpu_buffer *ucb, int dsize)
816{ 824{
817 uprobe_trace_print(tu, func, regs); 825 struct event_file_link *link;
826
827 rcu_read_lock();
828 list_for_each_entry_rcu(link, &tu->tp.files, list)
829 __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file);
830 rcu_read_unlock();
818} 831}
819 832
820/* Event entry printers */ 833/* Event entry printers */
@@ -861,12 +874,24 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self,
861 struct mm_struct *mm); 874 struct mm_struct *mm);
862 875
863static int 876static int
864probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) 877probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
878 filter_func_t filter)
865{ 879{
866 int ret = 0; 880 bool enabled = trace_probe_is_enabled(&tu->tp);
881 struct event_file_link *link = NULL;
882 int ret;
883
884 if (file) {
885 link = kmalloc(sizeof(*link), GFP_KERNEL);
886 if (!link)
887 return -ENOMEM;
867 888
868 if (trace_probe_is_enabled(&tu->tp)) 889 link->file = file;
869 return -EINTR; 890 list_add_tail_rcu(&link->list, &tu->tp.files);
891
892 tu->tp.flags |= TP_FLAG_TRACE;
893 } else
894 tu->tp.flags |= TP_FLAG_PROFILE;
870 895
871 ret = uprobe_buffer_enable(); 896 ret = uprobe_buffer_enable();
872 if (ret < 0) 897 if (ret < 0)
@@ -874,24 +899,49 @@ probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
874 899
875 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 900 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
876 901
877 tu->tp.flags |= flag; 902 if (enabled)
903 return 0;
904
878 tu->consumer.filter = filter; 905 tu->consumer.filter = filter;
879 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); 906 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
880 if (ret) 907 if (ret) {
881 tu->tp.flags &= ~flag; 908 if (file) {
909 list_del(&link->list);
910 kfree(link);
911 tu->tp.flags &= ~TP_FLAG_TRACE;
912 } else
913 tu->tp.flags &= ~TP_FLAG_PROFILE;
914 }
882 915
883 return ret; 916 return ret;
884} 917}
885 918
886static void probe_event_disable(struct trace_uprobe *tu, int flag) 919static void
920probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file)
887{ 921{
888 if (!trace_probe_is_enabled(&tu->tp)) 922 if (!trace_probe_is_enabled(&tu->tp))
889 return; 923 return;
890 924
925 if (file) {
926 struct event_file_link *link;
927
928 link = find_event_file_link(&tu->tp, file);
929 if (!link)
930 return;
931
932 list_del_rcu(&link->list);
933 /* synchronize with u{,ret}probe_trace_func */
934 synchronize_sched();
935 kfree(link);
936
937 if (!list_empty(&tu->tp.files))
938 return;
939 }
940
891 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 941 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
892 942
893 uprobe_unregister(tu->inode, tu->offset, &tu->consumer); 943 uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
894 tu->tp.flags &= ~flag; 944 tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
895 945
896 uprobe_buffer_disable(); 946 uprobe_buffer_disable();
897} 947}
@@ -1014,31 +1064,24 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
1014 return ret; 1064 return ret;
1015} 1065}
1016 1066
1017static void uprobe_perf_print(struct trace_uprobe *tu, 1067static void __uprobe_perf_func(struct trace_uprobe *tu,
1018 unsigned long func, struct pt_regs *regs) 1068 unsigned long func, struct pt_regs *regs,
1069 struct uprobe_cpu_buffer *ucb, int dsize)
1019{ 1070{
1020 struct ftrace_event_call *call = &tu->tp.call; 1071 struct ftrace_event_call *call = &tu->tp.call;
1021 struct uprobe_trace_entry_head *entry; 1072 struct uprobe_trace_entry_head *entry;
1022 struct hlist_head *head; 1073 struct hlist_head *head;
1023 struct uprobe_cpu_buffer *ucb;
1024 void *data; 1074 void *data;
1025 int size, dsize, esize; 1075 int size, esize;
1026 int rctx; 1076 int rctx;
1027 1077
1028 dsize = __get_data_size(&tu->tp, regs);
1029 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); 1078 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1030 1079
1031 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1032 return;
1033
1034 size = esize + tu->tp.size + dsize; 1080 size = esize + tu->tp.size + dsize;
1035 size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); 1081 size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
1036 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) 1082 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
1037 return; 1083 return;
1038 1084
1039 ucb = uprobe_buffer_get();
1040 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
1041
1042 preempt_disable(); 1085 preempt_disable();
1043 head = this_cpu_ptr(call->perf_events); 1086 head = this_cpu_ptr(call->perf_events);
1044 if (hlist_empty(head)) 1087 if (hlist_empty(head))
@@ -1068,46 +1111,49 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
1068 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1111 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1069 out: 1112 out:
1070 preempt_enable(); 1113 preempt_enable();
1071 uprobe_buffer_put(ucb);
1072} 1114}
1073 1115
1074/* uprobe profile handler */ 1116/* uprobe profile handler */
1075static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) 1117static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
1118 struct uprobe_cpu_buffer *ucb, int dsize)
1076{ 1119{
1077 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) 1120 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
1078 return UPROBE_HANDLER_REMOVE; 1121 return UPROBE_HANDLER_REMOVE;
1079 1122
1080 if (!is_ret_probe(tu)) 1123 if (!is_ret_probe(tu))
1081 uprobe_perf_print(tu, 0, regs); 1124 __uprobe_perf_func(tu, 0, regs, ucb, dsize);
1082 return 0; 1125 return 0;
1083} 1126}
1084 1127
1085static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, 1128static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
1086 struct pt_regs *regs) 1129 struct pt_regs *regs,
1130 struct uprobe_cpu_buffer *ucb, int dsize)
1087{ 1131{
1088 uprobe_perf_print(tu, func, regs); 1132 __uprobe_perf_func(tu, func, regs, ucb, dsize);
1089} 1133}
1090#endif /* CONFIG_PERF_EVENTS */ 1134#endif /* CONFIG_PERF_EVENTS */
1091 1135
1092static 1136static int
1093int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) 1137trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
1138 void *data)
1094{ 1139{
1095 struct trace_uprobe *tu = event->data; 1140 struct trace_uprobe *tu = event->data;
1141 struct ftrace_event_file *file = data;
1096 1142
1097 switch (type) { 1143 switch (type) {
1098 case TRACE_REG_REGISTER: 1144 case TRACE_REG_REGISTER:
1099 return probe_event_enable(tu, TP_FLAG_TRACE, NULL); 1145 return probe_event_enable(tu, file, NULL);
1100 1146
1101 case TRACE_REG_UNREGISTER: 1147 case TRACE_REG_UNREGISTER:
1102 probe_event_disable(tu, TP_FLAG_TRACE); 1148 probe_event_disable(tu, file);
1103 return 0; 1149 return 0;
1104 1150
1105#ifdef CONFIG_PERF_EVENTS 1151#ifdef CONFIG_PERF_EVENTS
1106 case TRACE_REG_PERF_REGISTER: 1152 case TRACE_REG_PERF_REGISTER:
1107 return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter); 1153 return probe_event_enable(tu, NULL, uprobe_perf_filter);
1108 1154
1109 case TRACE_REG_PERF_UNREGISTER: 1155 case TRACE_REG_PERF_UNREGISTER:
1110 probe_event_disable(tu, TP_FLAG_PROFILE); 1156 probe_event_disable(tu, NULL);
1111 return 0; 1157 return 0;
1112 1158
1113 case TRACE_REG_PERF_OPEN: 1159 case TRACE_REG_PERF_OPEN:
@@ -1127,8 +1173,11 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
1127{ 1173{
1128 struct trace_uprobe *tu; 1174 struct trace_uprobe *tu;
1129 struct uprobe_dispatch_data udd; 1175 struct uprobe_dispatch_data udd;
1176 struct uprobe_cpu_buffer *ucb;
1177 int dsize, esize;
1130 int ret = 0; 1178 int ret = 0;
1131 1179
1180
1132 tu = container_of(con, struct trace_uprobe, consumer); 1181 tu = container_of(con, struct trace_uprobe, consumer);
1133 tu->nhit++; 1182 tu->nhit++;
1134 1183
@@ -1137,13 +1186,29 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
1137 1186
1138 current->utask->vaddr = (unsigned long) &udd; 1187 current->utask->vaddr = (unsigned long) &udd;
1139 1188
1189#ifdef CONFIG_PERF_EVENTS
1190 if ((tu->tp.flags & TP_FLAG_TRACE) == 0 &&
1191 !uprobe_perf_filter(&tu->consumer, 0, current->mm))
1192 return UPROBE_HANDLER_REMOVE;
1193#endif
1194
1195 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1196 return 0;
1197
1198 dsize = __get_data_size(&tu->tp, regs);
1199 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1200
1201 ucb = uprobe_buffer_get();
1202 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
1203
1140 if (tu->tp.flags & TP_FLAG_TRACE) 1204 if (tu->tp.flags & TP_FLAG_TRACE)
1141 ret |= uprobe_trace_func(tu, regs); 1205 ret |= uprobe_trace_func(tu, regs, ucb, dsize);
1142 1206
1143#ifdef CONFIG_PERF_EVENTS 1207#ifdef CONFIG_PERF_EVENTS
1144 if (tu->tp.flags & TP_FLAG_PROFILE) 1208 if (tu->tp.flags & TP_FLAG_PROFILE)
1145 ret |= uprobe_perf_func(tu, regs); 1209 ret |= uprobe_perf_func(tu, regs, ucb, dsize);
1146#endif 1210#endif
1211 uprobe_buffer_put(ucb);
1147 return ret; 1212 return ret;
1148} 1213}
1149 1214
@@ -1152,6 +1217,8 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
1152{ 1217{
1153 struct trace_uprobe *tu; 1218 struct trace_uprobe *tu;
1154 struct uprobe_dispatch_data udd; 1219 struct uprobe_dispatch_data udd;
1220 struct uprobe_cpu_buffer *ucb;
1221 int dsize, esize;
1155 1222
1156 tu = container_of(con, struct trace_uprobe, consumer); 1223 tu = container_of(con, struct trace_uprobe, consumer);
1157 1224
@@ -1160,13 +1227,23 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
1160 1227
1161 current->utask->vaddr = (unsigned long) &udd; 1228 current->utask->vaddr = (unsigned long) &udd;
1162 1229
1230 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1231 return 0;
1232
1233 dsize = __get_data_size(&tu->tp, regs);
1234 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1235
1236 ucb = uprobe_buffer_get();
1237 store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);
1238
1163 if (tu->tp.flags & TP_FLAG_TRACE) 1239 if (tu->tp.flags & TP_FLAG_TRACE)
1164 uretprobe_trace_func(tu, func, regs); 1240 uretprobe_trace_func(tu, func, regs, ucb, dsize);
1165 1241
1166#ifdef CONFIG_PERF_EVENTS 1242#ifdef CONFIG_PERF_EVENTS
1167 if (tu->tp.flags & TP_FLAG_PROFILE) 1243 if (tu->tp.flags & TP_FLAG_PROFILE)
1168 uretprobe_perf_func(tu, func, regs); 1244 uretprobe_perf_func(tu, func, regs, ucb, dsize);
1169#endif 1245#endif
1246 uprobe_buffer_put(ucb);
1170 return 0; 1247 return 0;
1171} 1248}
1172 1249
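The trace_uprobe.c rework above moves the per-CPU scratch buffer handling into the dispatchers: the buffer and data sizes are fetched once, handed to both the trace and perf paths, and released after both have run. A schematic, user-space illustration of that acquire-once/pass-down/release-once shape (buffer handling here is just malloc/free):

#include <stdio.h>
#include <stdlib.h>

#define FLAG_TRACE   0x1
#define FLAG_PROFILE 0x2

static void trace_path(const char *buf)   { printf("trace: %s\n", buf); }
static void profile_path(const char *buf) { printf("perf:  %s\n", buf); }

static int dispatcher(unsigned int flags, const char *args)
{
        char *ucb;

        ucb = malloc(128);               /* like uprobe_buffer_get() */
        if (!ucb)
                return 0;
        snprintf(ucb, 128, "%s", args);  /* like store_trace_args() */

        if (flags & FLAG_TRACE)
                trace_path(ucb);
        if (flags & FLAG_PROFILE)
                profile_path(ucb);

        free(ucb);                       /* like uprobe_buffer_put() */
        return 0;
}

int main(void)
{
        return dispatcher(FLAG_TRACE | FLAG_PROFILE, "pid=42 addr=0x400123");
}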
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 031cc5655a51..fb0a38a26555 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -62,14 +62,12 @@ struct tracepoint_entry {
62 struct hlist_node hlist; 62 struct hlist_node hlist;
63 struct tracepoint_func *funcs; 63 struct tracepoint_func *funcs;
64 int refcount; /* Number of times armed. 0 if disarmed. */ 64 int refcount; /* Number of times armed. 0 if disarmed. */
65 int enabled; /* Tracepoint enabled */
65 char name[0]; 66 char name[0];
66}; 67};
67 68
68struct tp_probes { 69struct tp_probes {
69 union { 70 struct rcu_head rcu;
70 struct rcu_head rcu;
71 struct list_head list;
72 } u;
73 struct tracepoint_func probes[0]; 71 struct tracepoint_func probes[0];
74}; 72};
75 73
@@ -82,7 +80,7 @@ static inline void *allocate_probes(int count)
82 80
83static void rcu_free_old_probes(struct rcu_head *head) 81static void rcu_free_old_probes(struct rcu_head *head)
84{ 82{
85 kfree(container_of(head, struct tp_probes, u.rcu)); 83 kfree(container_of(head, struct tp_probes, rcu));
86} 84}
87 85
88static inline void release_probes(struct tracepoint_func *old) 86static inline void release_probes(struct tracepoint_func *old)
@@ -90,7 +88,7 @@ static inline void release_probes(struct tracepoint_func *old)
90 if (old) { 88 if (old) {
91 struct tp_probes *tp_probes = container_of(old, 89 struct tp_probes *tp_probes = container_of(old,
92 struct tp_probes, probes[0]); 90 struct tp_probes, probes[0]);
93 call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes); 91 call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes);
94 } 92 }
95} 93}
96 94
@@ -237,6 +235,7 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
237 memcpy(&e->name[0], name, name_len); 235 memcpy(&e->name[0], name, name_len);
238 e->funcs = NULL; 236 e->funcs = NULL;
239 e->refcount = 0; 237 e->refcount = 0;
238 e->enabled = 0;
240 hlist_add_head(&e->hlist, head); 239 hlist_add_head(&e->hlist, head);
241 return e; 240 return e;
242} 241}
@@ -316,6 +315,7 @@ static void tracepoint_update_probe_range(struct tracepoint * const *begin,
316 if (mark_entry) { 315 if (mark_entry) {
317 set_tracepoint(&mark_entry, *iter, 316 set_tracepoint(&mark_entry, *iter,
318 !!mark_entry->refcount); 317 !!mark_entry->refcount);
318 mark_entry->enabled = !!mark_entry->refcount;
319 } else { 319 } else {
320 disable_tracepoint(*iter); 320 disable_tracepoint(*iter);
321 } 321 }
@@ -373,13 +373,26 @@ tracepoint_add_probe(const char *name, void *probe, void *data)
373 * tracepoint_probe_register - Connect a probe to a tracepoint 373 * tracepoint_probe_register - Connect a probe to a tracepoint
374 * @name: tracepoint name 374 * @name: tracepoint name
375 * @probe: probe handler 375 * @probe: probe handler
376 * @data: probe private data
377 *
378 * Returns:
379 * - 0 if the probe was successfully registered, and tracepoint
380 * callsites are currently loaded for that probe,
381 * - -ENODEV if the probe was successfully registered, but no tracepoint
382 * callsite is currently loaded for that probe,
383 * - other negative error value on error.
384 *
385 * When tracepoint_probe_register() returns either 0 or -ENODEV,
386 * parameters @name, @probe, and @data may be used by the tracepoint
387 * infrastructure until the probe is unregistered.
376 * 388 *
377 * Returns 0 if ok, error value on error.
378 * The probe address must at least be aligned on the architecture pointer size. 389 * The probe address must at least be aligned on the architecture pointer size.
379 */ 390 */
380int tracepoint_probe_register(const char *name, void *probe, void *data) 391int tracepoint_probe_register(const char *name, void *probe, void *data)
381{ 392{
382 struct tracepoint_func *old; 393 struct tracepoint_func *old;
394 struct tracepoint_entry *entry;
395 int ret = 0;
383 396
384 mutex_lock(&tracepoints_mutex); 397 mutex_lock(&tracepoints_mutex);
385 old = tracepoint_add_probe(name, probe, data); 398 old = tracepoint_add_probe(name, probe, data);
@@ -388,9 +401,13 @@ int tracepoint_probe_register(const char *name, void *probe, void *data)
388 return PTR_ERR(old); 401 return PTR_ERR(old);
389 } 402 }
390 tracepoint_update_probes(); /* may update entry */ 403 tracepoint_update_probes(); /* may update entry */
404 entry = get_tracepoint(name);
405 /* Make sure the entry was enabled */
406 if (!entry || !entry->enabled)
407 ret = -ENODEV;
391 mutex_unlock(&tracepoints_mutex); 408 mutex_unlock(&tracepoints_mutex);
392 release_probes(old); 409 release_probes(old);
393 return 0; 410 return ret;
394} 411}
395EXPORT_SYMBOL_GPL(tracepoint_probe_register); 412EXPORT_SYMBOL_GPL(tracepoint_probe_register);
396 413
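A minimal sketch (not part of the patch) of how a probe author might handle the return contract documented above. The tracepoint name "sched_switch" and the probe signature are illustrative only; a real probe prototype must match the TP_PROTO() of whichever tracepoint is traced.

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/tracepoint.h>

static void my_probe(void *data, struct task_struct *prev,
                     struct task_struct *next)
{
        /* Runs on every sched_switch event while registered. */
}

static int __init my_probe_init(void)
{
        int ret;

        ret = tracepoint_probe_register("sched_switch", my_probe, NULL);
        if (ret == -ENODEV) {
                /* The probe is registered, but no callsite is currently
                 * loaded (e.g. the module providing the tracepoint is
                 * absent); it will fire once callsites appear. */
                pr_info("sched_switch callsites not loaded yet\n");
                return 0;
        }
        return ret;
}

static void __exit my_probe_exit(void)
{
        tracepoint_probe_unregister("sched_switch", my_probe, NULL);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");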
@@ -415,6 +432,7 @@ tracepoint_remove_probe(const char *name, void *probe, void *data)
415 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint 432 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
416 * @name: tracepoint name 433 * @name: tracepoint name
417 * @probe: probe function pointer 434 * @probe: probe function pointer
435 * @data: probe private data
418 * 436 *
419 * We do not need to call a synchronize_sched to make sure the probes have 437 * We do not need to call a synchronize_sched to make sure the probes have
420 * finished running before doing a module unload, because the module unload 438 * finished running before doing a module unload, because the module unload
@@ -438,213 +456,26 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data)
438} 456}
439EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); 457EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
440 458
441static LIST_HEAD(old_probes);
442static int need_update;
443
444static void tracepoint_add_old_probes(void *old)
445{
446 need_update = 1;
447 if (old) {
448 struct tp_probes *tp_probes = container_of(old,
449 struct tp_probes, probes[0]);
450 list_add(&tp_probes->u.list, &old_probes);
451 }
452}
453
454/**
455 * tracepoint_probe_register_noupdate - register a probe but not connect
456 * @name: tracepoint name
457 * @probe: probe handler
458 *
459 * caller must call tracepoint_probe_update_all()
460 */
461int tracepoint_probe_register_noupdate(const char *name, void *probe,
462 void *data)
463{
464 struct tracepoint_func *old;
465
466 mutex_lock(&tracepoints_mutex);
467 old = tracepoint_add_probe(name, probe, data);
468 if (IS_ERR(old)) {
469 mutex_unlock(&tracepoints_mutex);
470 return PTR_ERR(old);
471 }
472 tracepoint_add_old_probes(old);
473 mutex_unlock(&tracepoints_mutex);
474 return 0;
475}
476EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
477
478/**
479 * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect
480 * @name: tracepoint name
481 * @probe: probe function pointer
482 *
483 * caller must call tracepoint_probe_update_all()
484 */
485int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
486 void *data)
487{
488 struct tracepoint_func *old;
489
490 mutex_lock(&tracepoints_mutex);
491 old = tracepoint_remove_probe(name, probe, data);
492 if (IS_ERR(old)) {
493 mutex_unlock(&tracepoints_mutex);
494 return PTR_ERR(old);
495 }
496 tracepoint_add_old_probes(old);
497 mutex_unlock(&tracepoints_mutex);
498 return 0;
499}
500EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
501
502/**
503 * tracepoint_probe_update_all - update tracepoints
504 */
505void tracepoint_probe_update_all(void)
506{
507 LIST_HEAD(release_probes);
508 struct tp_probes *pos, *next;
509
510 mutex_lock(&tracepoints_mutex);
511 if (!need_update) {
512 mutex_unlock(&tracepoints_mutex);
513 return;
514 }
515 if (!list_empty(&old_probes))
516 list_replace_init(&old_probes, &release_probes);
517 need_update = 0;
518 tracepoint_update_probes();
519 mutex_unlock(&tracepoints_mutex);
520 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
521 list_del(&pos->u.list);
522 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
523 }
524}
525EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
526
527/**
528 * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
529 * @tracepoint: current tracepoints (in), next tracepoint (out)
530 * @begin: beginning of the range
531 * @end: end of the range
532 *
533 * Returns whether a next tracepoint has been found (1) or not (0).
534 * Will return the first tracepoint in the range if the input tracepoint is
535 * NULL.
536 */
537static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
538 struct tracepoint * const *begin, struct tracepoint * const *end)
539{
540 if (!*tracepoint && begin != end) {
541 *tracepoint = begin;
542 return 1;
543 }
544 if (*tracepoint >= begin && *tracepoint < end)
545 return 1;
546 return 0;
547}
548
549#ifdef CONFIG_MODULES
550static void tracepoint_get_iter(struct tracepoint_iter *iter)
551{
552 int found = 0;
553 struct tp_module *iter_mod;
554
555 /* Core kernel tracepoints */
556 if (!iter->module) {
557 found = tracepoint_get_iter_range(&iter->tracepoint,
558 __start___tracepoints_ptrs,
559 __stop___tracepoints_ptrs);
560 if (found)
561 goto end;
562 }
563 /* Tracepoints in modules */
564 mutex_lock(&tracepoints_mutex);
565 list_for_each_entry(iter_mod, &tracepoint_module_list, list) {
566 /*
567 * Sorted module list
568 */
569 if (iter_mod < iter->module)
570 continue;
571 else if (iter_mod > iter->module)
572 iter->tracepoint = NULL;
573 found = tracepoint_get_iter_range(&iter->tracepoint,
574 iter_mod->tracepoints_ptrs,
575 iter_mod->tracepoints_ptrs
576 + iter_mod->num_tracepoints);
577 if (found) {
578 iter->module = iter_mod;
579 break;
580 }
581 }
582 mutex_unlock(&tracepoints_mutex);
583end:
584 if (!found)
585 tracepoint_iter_reset(iter);
586}
587#else /* CONFIG_MODULES */
588static void tracepoint_get_iter(struct tracepoint_iter *iter)
589{
590 int found = 0;
591
592 /* Core kernel tracepoints */
593 found = tracepoint_get_iter_range(&iter->tracepoint,
594 __start___tracepoints_ptrs,
595 __stop___tracepoints_ptrs);
596 if (!found)
597 tracepoint_iter_reset(iter);
598}
599#endif /* CONFIG_MODULES */
600
601void tracepoint_iter_start(struct tracepoint_iter *iter)
602{
603 tracepoint_get_iter(iter);
604}
605EXPORT_SYMBOL_GPL(tracepoint_iter_start);
606
607void tracepoint_iter_next(struct tracepoint_iter *iter)
608{
609 iter->tracepoint++;
610 /*
611 * iter->tracepoint may be invalid because we blindly incremented it.
612 * Make sure it is valid by marshalling on the tracepoints, getting the
613 * tracepoints from following modules if necessary.
614 */
615 tracepoint_get_iter(iter);
616}
617EXPORT_SYMBOL_GPL(tracepoint_iter_next);
618
619void tracepoint_iter_stop(struct tracepoint_iter *iter)
620{
621}
622EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
623
624void tracepoint_iter_reset(struct tracepoint_iter *iter)
625{
626#ifdef CONFIG_MODULES
627 iter->module = NULL;
628#endif /* CONFIG_MODULES */
629 iter->tracepoint = NULL;
630}
631EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
632 459
633#ifdef CONFIG_MODULES 460#ifdef CONFIG_MODULES
634bool trace_module_has_bad_taint(struct module *mod) 461bool trace_module_has_bad_taint(struct module *mod)
635{ 462{
636 return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)); 463 return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) |
464 (1 << TAINT_UNSIGNED_MODULE));
637} 465}
638 466
639static int tracepoint_module_coming(struct module *mod) 467static int tracepoint_module_coming(struct module *mod)
640{ 468{
641 struct tp_module *tp_mod, *iter; 469 struct tp_module *tp_mod;
642 int ret = 0; 470 int ret = 0;
643 471
472 if (!mod->num_tracepoints)
473 return 0;
474
644 /* 475 /*
645 * We skip modules that taint the kernel, especially those with different 476 * We skip modules that taint the kernel, especially those with different
646 * module headers (for forced load), to make sure we don't cause a crash. 477 * module headers (for forced load), to make sure we don't cause a crash.
647 * Staging and out-of-tree GPL modules are fine. 478 * Staging, out-of-tree, and unsigned GPL modules are fine.
648 */ 479 */
649 if (trace_module_has_bad_taint(mod)) 480 if (trace_module_has_bad_taint(mod))
650 return 0; 481 return 0;
@@ -656,23 +487,7 @@ static int tracepoint_module_coming(struct module *mod)
656 } 487 }
657 tp_mod->num_tracepoints = mod->num_tracepoints; 488 tp_mod->num_tracepoints = mod->num_tracepoints;
658 tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; 489 tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs;
659 490 list_add_tail(&tp_mod->list, &tracepoint_module_list);
660 /*
661 * tracepoint_module_list is kept sorted by struct module pointer
662 * address for iteration on tracepoints from a seq_file that can release
663 * the mutex between calls.
664 */
665 list_for_each_entry_reverse(iter, &tracepoint_module_list, list) {
666 BUG_ON(iter == tp_mod); /* Should never be in the list twice */
667 if (iter < tp_mod) {
668 /* We belong to the location right after iter. */
669 list_add(&tp_mod->list, &iter->list);
670 goto module_added;
671 }
672 }
673 /* We belong to the beginning of the list */
674 list_add(&tp_mod->list, &tracepoint_module_list);
675module_added:
676 tracepoint_update_probe_range(mod->tracepoints_ptrs, 491 tracepoint_update_probe_range(mod->tracepoints_ptrs,
677 mod->tracepoints_ptrs + mod->num_tracepoints); 492 mod->tracepoints_ptrs + mod->num_tracepoints);
678end: 493end:
@@ -684,6 +499,9 @@ static int tracepoint_module_going(struct module *mod)
684{ 499{
685 struct tp_module *pos; 500 struct tp_module *pos;
686 501
502 if (!mod->num_tracepoints)
503 return 0;
504
687 mutex_lock(&tracepoints_mutex); 505 mutex_lock(&tracepoints_mutex);
688 tracepoint_update_probe_range(mod->tracepoints_ptrs, 506 tracepoint_update_probe_range(mod->tracepoints_ptrs,
689 mod->tracepoints_ptrs + mod->num_tracepoints); 507 mod->tracepoints_ptrs + mod->num_tracepoints);
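For illustration only, a hypothetical helper restating the widened taint filter from trace_module_has_bad_taint() above: out-of-tree, staging ("crap"), and now unsigned modules may still have their tracepoints hooked, while any other taint flag disqualifies the module. The helper name is not from the patch.

#include <linux/module.h>

static bool module_tracepoints_allowed(const struct module *mod)
{
        unsigned int tolerated = (1 << TAINT_OOT_MODULE) |
                                 (1 << TAINT_CRAP) |
                                 (1 << TAINT_UNSIGNED_MODULE);

        /* Same test as the patched code, inverted for readability. */
        return !(mod->taints & ~tolerated);
}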
diff --git a/kernel/up.c b/kernel/up.c
index 509403e3fbc6..1760bf3d1463 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -22,16 +22,16 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
22} 22}
23EXPORT_SYMBOL(smp_call_function_single); 23EXPORT_SYMBOL(smp_call_function_single);
24 24
25void __smp_call_function_single(int cpu, struct call_single_data *csd, 25int smp_call_function_single_async(int cpu, struct call_single_data *csd)
26 int wait)
27{ 26{
28 unsigned long flags; 27 unsigned long flags;
29 28
30 local_irq_save(flags); 29 local_irq_save(flags);
31 csd->func(csd->info); 30 csd->func(csd->info);
32 local_irq_restore(flags); 31 local_irq_restore(flags);
32 return 0;
33} 33}
34EXPORT_SYMBOL(__smp_call_function_single); 34EXPORT_SYMBOL(smp_call_function_single_async);
35 35
36int on_each_cpu(smp_call_func_t func, void *info, int wait) 36int on_each_cpu(smp_call_func_t func, void *info, int wait)
37{ 37{
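A brief sketch (not part of the patch) of the renamed API: smp_call_function_single_async() takes a caller-owned call_single_data and never waits, so the csd must stay valid until its function has run; on UP, as shown above, it simply runs the function locally with interrupts disabled. The names below are illustrative.

#include <linux/smp.h>

static void remote_kick(void *info)
{
        /* Executes on the target CPU (IPI context on SMP builds). */
}

/* The csd must outlive the call; a static (or per-CPU) object is typical. */
static struct call_single_data kick_csd = {
        .func = remote_kick,
        .info = NULL,
};

static int kick_cpu(int cpu)
{
        /* Fire-and-forget: returns immediately; the return value only
         * reports submission errors. */
        return smp_call_function_single_async(cpu, &kick_csd);
}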
diff --git a/kernel/user.c b/kernel/user.c
index c006131beb77..294fc6a94168 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -222,5 +222,4 @@ static int __init uid_cache_init(void)
222 222
223 return 0; 223 return 0;
224} 224}
225 225subsys_initcall(uid_cache_init);
226module_init(uid_cache_init);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index dd06439b9c84..0d8f6023fd8d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -902,4 +902,4 @@ static __init int user_namespaces_init(void)
902 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); 902 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
903 return 0; 903 return 0;
904} 904}
905module_init(user_namespaces_init); 905subsys_initcall(user_namespaces_init);
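As context for the initcall switch above: for built-in code, module_init() maps to device_initcall() (level 6), whereas subsys_initcall() runs at the earlier level 4, so these caches and namespaces are presumably meant to be ready before later initcalls run. The function below is purely illustrative.

#include <linux/init.h>

static int __init example_subsys_setup(void)
{
        /* Runs during the subsys initcall level, before device_initcall()
         * users that may depend on this state. */
        return 0;
}
subsys_initcall(example_subsys_setup);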
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4431610f049a..e90089fd78e0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -158,14 +158,14 @@ void touch_all_softlockup_watchdogs(void)
158#ifdef CONFIG_HARDLOCKUP_DETECTOR 158#ifdef CONFIG_HARDLOCKUP_DETECTOR
159void touch_nmi_watchdog(void) 159void touch_nmi_watchdog(void)
160{ 160{
161 if (watchdog_user_enabled) { 161 /*
162 unsigned cpu; 162 * Using __raw here because some code paths have
163 163 * preemption enabled. If preemption is enabled
164 for_each_present_cpu(cpu) { 164 * then interrupts should be enabled too, in which
165 if (per_cpu(watchdog_nmi_touch, cpu) != true) 165 * case we shouldn't have to worry about the watchdog
166 per_cpu(watchdog_nmi_touch, cpu) = true; 166 * going off.
167 } 167 */
168 } 168 __raw_get_cpu_var(watchdog_nmi_touch) = true;
169 touch_softlockup_watchdog(); 169 touch_softlockup_watchdog();
170} 170}
171EXPORT_SYMBOL(touch_nmi_watchdog); 171EXPORT_SYMBOL(touch_nmi_watchdog);
@@ -505,7 +505,6 @@ static void restart_watchdog_hrtimer(void *info)
505 505
506static void update_timers(int cpu) 506static void update_timers(int cpu)
507{ 507{
508 struct call_single_data data = {.func = restart_watchdog_hrtimer};
509 /* 508 /*
510 * Make sure that the perf event counter will adapt to a new 509
511 * sampling period. Updating the sampling period directly would 510 * sampling period. Updating the sampling period directly would
@@ -515,7 +514,7 @@ static void update_timers(int cpu)
515 * might be late already so we have to restart the timer as well. 514 * might be late already so we have to restart the timer as well.
516 */ 515 */
517 watchdog_nmi_disable(cpu); 516 watchdog_nmi_disable(cpu);
518 __smp_call_function_single(cpu, &data, 1); 517 smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1);
519 watchdog_nmi_enable(cpu); 518 watchdog_nmi_enable(cpu);
520} 519}
521 520
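A short sketch (not from the patch) of the kind of caller the cheaper, CPU-local touch_nmi_watchdog() above is meant for: a long busy loop that must keep petting the watchdog on the CPU it is spinning on. The polling callback is hypothetical.

#include <linux/delay.h>
#include <linux/nmi.h>

static void long_busy_poll(void (*poll_once)(void *), void *arg)
{
        int i;

        for (i = 0; i < 10000; i++) {
                poll_once(arg);
                /* Now only marks the current CPU's watchdog_nmi_touch
                 * instead of walking every present CPU. */
                touch_nmi_watchdog();
                udelay(100);
        }
}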
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 193e977a10ea..0ee63af30bd1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -516,6 +516,13 @@ void destroy_work_on_stack(struct work_struct *work)
516} 516}
517EXPORT_SYMBOL_GPL(destroy_work_on_stack); 517EXPORT_SYMBOL_GPL(destroy_work_on_stack);
518 518
519void destroy_delayed_work_on_stack(struct delayed_work *work)
520{
521 destroy_timer_on_stack(&work->timer);
522 debug_object_free(&work->work, &work_debug_descr);
523}
524EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);
525
519#else 526#else
520static inline void debug_work_activate(struct work_struct *work) { } 527static inline void debug_work_activate(struct work_struct *work) { }
521static inline void debug_work_deactivate(struct work_struct *work) { } 528static inline void debug_work_deactivate(struct work_struct *work) { }
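A minimal sketch (not part of the patch) of the lifecycle the new helper serves: a delayed_work declared on the stack under CONFIG_DEBUG_OBJECTS_WORK is initialised with the _ONSTACK variant and torn down with destroy_delayed_work_on_stack(), which releases the debugobjects state of both the embedded timer and the work item. The work function and delay are illustrative.

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void stack_work_fn(struct work_struct *work)
{
        /* Executed by a workqueue worker. */
}

static void run_stack_delayed_work(void)
{
        struct delayed_work dwork;

        INIT_DELAYED_WORK_ONSTACK(&dwork, stack_work_fn);
        schedule_delayed_work(&dwork, msecs_to_jiffies(10));
        flush_delayed_work(&dwork);
        destroy_delayed_work_on_stack(&dwork); /* new helper above */
}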
@@ -3225,7 +3232,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
3225 return -ENOMEM; 3232 return -ENOMEM;
3226 3233
3227 if (sscanf(buf, "%d", &attrs->nice) == 1 && 3234 if (sscanf(buf, "%d", &attrs->nice) == 1 &&
3228 attrs->nice >= -20 && attrs->nice <= 19) 3235 attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
3229 ret = apply_workqueue_attrs(wq, attrs); 3236 ret = apply_workqueue_attrs(wq, attrs);
3230 else 3237 else
3231 ret = -EINVAL; 3238 ret = -EINVAL;