author     Sage Weil <sage@inktank.com>    2013-08-15 14:11:45 -0400
committer  Sage Weil <sage@inktank.com>    2013-08-15 14:11:45 -0400
commit     ee3e542fec6e69bc9fb668698889a37d93950ddf (patch)
tree       e74ee766a4764769ef1d3d45d266b4dea64101d3 /kernel
parent     fe2a801b50c0bb8039d627e5ae1fec249d10ff39 (diff)
parent     f1d6e17f540af37bb1891480143669ba7636c4cf (diff)
Merge remote-tracking branch 'linus/master' into testing
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 6
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/audit.h | 1
-rw-r--r--  kernel/auditfilter.c | 8
-rw-r--r--  kernel/auditsc.c | 12
-rw-r--r--  kernel/cgroup.c | 1556
-rw-r--r--  kernel/cpu.c | 6
-rw-r--r--  kernel/cpuset.c | 482
-rw-r--r--  kernel/events/core.c | 308
-rw-r--r--  kernel/events/hw_breakpoint.c | 191
-rw-r--r--  kernel/exit.c | 15
-rw-r--r--  kernel/fork.c | 78
-rw-r--r--  kernel/freezer.c | 14
-rw-r--r--  kernel/futex.c | 6
-rw-r--r--  kernel/hrtimer.c | 41
-rw-r--r--  kernel/irq/chip.c | 13
-rw-r--r--  kernel/irq/generic-chip.c | 314
-rw-r--r--  kernel/irq/irqdomain.c | 599
-rw-r--r--  kernel/irq/manage.c | 17
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/kmod.c | 11
-rw-r--r--  kernel/kprobes.c | 3
-rw-r--r--  kernel/lockdep.c | 17
-rw-r--r--  kernel/module.c | 77
-rw-r--r--  kernel/mutex.c | 385
-rw-r--r--  kernel/panic.c | 8
-rw-r--r--  kernel/params.c | 2
-rw-r--r--  kernel/pid.c | 14
-rw-r--r--  kernel/posix-cpu-timers.c | 395
-rw-r--r--  kernel/power/Kconfig | 21
-rw-r--r--  kernel/power/autosleep.c | 3
-rw-r--r--  kernel/power/main.c | 6
-rw-r--r--  kernel/power/process.c | 37
-rw-r--r--  kernel/power/qos.c | 14
-rw-r--r--  kernel/power/snapshot.c | 9
-rw-r--r--  kernel/power/suspend.c | 2
-rw-r--r--  kernel/printk/Makefile | 2
-rw-r--r--  kernel/printk/braille.c | 49
-rw-r--r--  kernel/printk/braille.h | 48
-rw-r--r--  kernel/printk/console_cmdline.h | 14
-rw-r--r--  kernel/printk/printk.c (renamed from kernel/printk.c) | 187
-rw-r--r--  kernel/profile.c | 2
-rw-r--r--  kernel/ptrace.c | 60
-rw-r--r--  kernel/rcupdate.c | 29
-rw-r--r--  kernel/rcutiny.c | 21
-rw-r--r--  kernel/rcutiny_plugin.h | 1009
-rw-r--r--  kernel/rcutorture.c | 45
-rw-r--r--  kernel/rcutree.c | 176
-rw-r--r--  kernel/rcutree.h | 19
-rw-r--r--  kernel/rcutree_plugin.h | 87
-rw-r--r--  kernel/reboot.c | 419
-rw-r--r--  kernel/relay.c | 2
-rw-r--r--  kernel/resource.c | 2
-rw-r--r--  kernel/rtmutex.c | 13
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/auto_group.c | 3
-rw-r--r--  kernel/sched/core.c | 765
-rw-r--r--  kernel/sched/cpupri.c | 4
-rw-r--r--  kernel/sched/cputime.c | 5
-rw-r--r--  kernel/sched/debug.c | 37
-rw-r--r--  kernel/sched/fair.c | 191
-rw-r--r--  kernel/sched/proc.c | 591
-rw-r--r--  kernel/sched/rt.c | 132
-rw-r--r--  kernel/sched/sched.h | 71
-rw-r--r--  kernel/sched/stats.h | 47
-rw-r--r--  kernel/sched/stop_task.c | 8
-rw-r--r--  kernel/signal.c | 2
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/smpboot.c | 2
-rw-r--r--  kernel/softirq.c | 18
-rw-r--r--  kernel/sys.c | 352
-rw-r--r--  kernel/sysctl.c | 29
-rw-r--r--  kernel/sysctl_binary.c | 1
-rw-r--r--  kernel/time.c | 2
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/alarmtimer.c | 47
-rw-r--r--  kernel/time/clockevents.c | 271
-rw-r--r--  kernel/time/clocksource.c | 266
-rw-r--r--  kernel/time/sched_clock.c | 212
-rw-r--r--  kernel/time/tick-broadcast.c | 129
-rw-r--r--  kernel/time/tick-common.c | 197
-rw-r--r--  kernel/time/tick-internal.h | 17
-rw-r--r--  kernel/time/tick-sched.c | 26
-rw-r--r--  kernel/time/timekeeping.c | 65
-rw-r--r--  kernel/time/timekeeping_debug.c | 72
-rw-r--r--  kernel/time/timekeeping_internal.h | 14
-rw-r--r--  kernel/timer.c | 18
-rw-r--r--  kernel/trace/ftrace.c | 143
-rw-r--r--  kernel/trace/ring_buffer.c | 26
-rw-r--r--  kernel/trace/trace.c | 496
-rw-r--r--  kernel/trace/trace.h | 35
-rw-r--r--  kernel/trace/trace_event_perf.c | 10
-rw-r--r--  kernel/trace/trace_events.c | 406
-rw-r--r--  kernel/trace/trace_events_filter.c | 27
-rw-r--r--  kernel/trace/trace_functions.c | 105
-rw-r--r--  kernel/trace/trace_functions_graph.c | 54
-rw-r--r--  kernel/trace/trace_irqsoff.c | 4
-rw-r--r--  kernel/trace/trace_kprobe.c | 230
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 8
-rw-r--r--  kernel/trace/trace_output.c | 14
-rw-r--r--  kernel/trace/trace_selftest.c | 18
-rw-r--r--  kernel/trace/trace_syscalls.c | 47
-rw-r--r--  kernel/trace/trace_uprobe.c | 57
-rw-r--r--  kernel/user_namespace.c | 17
-rw-r--r--  kernel/wait.c | 89
-rw-r--r--  kernel/watchdog.c | 113
-rw-r--r--  kernel/workqueue.c | 74
-rw-r--r--  kernel/workqueue_internal.h | 2
108 files changed, 6967 insertions(+), 5470 deletions(-)
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 44511d100eaa..d2b32ac27a39 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -138,7 +138,7 @@ config INLINE_SPIN_UNLOCK_BH
 
 config INLINE_SPIN_UNLOCK_IRQ
 	def_bool y
-	depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH
+	depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ
 
 config INLINE_SPIN_UNLOCK_IRQRESTORE
 	def_bool y
@@ -175,7 +175,7 @@ config INLINE_READ_UNLOCK_BH
 
 config INLINE_READ_UNLOCK_IRQ
 	def_bool y
-	depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH
+	depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ
 
 config INLINE_READ_UNLOCK_IRQRESTORE
 	def_bool y
@@ -212,7 +212,7 @@ config INLINE_WRITE_UNLOCK_BH
 
 config INLINE_WRITE_UNLOCK_IRQ
 	def_bool y
-	depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH
+	depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ
 
 config INLINE_WRITE_UNLOCK_IRQRESTORE
 	def_bool y
diff --git a/kernel/Makefile b/kernel/Makefile
index 271fd3119af9..35ef1185e359 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,14 +2,14 @@
 # Makefile for the linux kernel.
 #
 
-obj-y = fork.o exec_domain.o panic.o printk.o \
+obj-y = fork.o exec_domain.o panic.o \
 	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
 	    sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o cred.o \
+	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o groups.o lglock.o smpboot.o
 
 ifdef CONFIG_FUNCTION_TRACER
@@ -24,6 +24,7 @@ endif
 
 obj-y += sched/
 obj-y += power/
+obj-y += printk/
 obj-y += cpu/
 
 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
diff --git a/kernel/audit.h b/kernel/audit.h
index 1c95131ef760..123c9b7c3979 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -85,6 +85,7 @@ struct audit_names {
 
 	struct filename	*name;
 	int		name_len;	/* number of chars to log */
+	bool		hidden;		/* don't log this record */
 	bool		name_put;	/* call __putname()? */
 
 	unsigned long	ino;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 6bd4a90d1991..f7aee8be7fb2 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		f->lsm_rule = NULL;
 
 		/* Support legacy tests for a valid loginuid */
-		if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) {
+		if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) {
 			f->type = AUDIT_LOGINUID_SET;
 			f->val = 0;
 		}
@@ -865,6 +865,12 @@ static inline int audit_add_rule(struct audit_entry *entry)
 		err = audit_add_watch(&entry->rule, &list);
 		if (err) {
 			mutex_unlock(&audit_filter_mutex);
+			/*
+			 * normally audit_add_tree_rule() will free it
+			 * on failure
+			 */
+			if (tree)
+				audit_put_tree(tree);
 			goto error;
 		}
 	}
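
The legacy-loginuid hunk above swaps the literal 4294967295 for ~0U; both spell the same all-ones 32-bit value that old userspace passes for an unset loginuid. A stand-alone sketch (plain userspace C, not from this merge) confirming the equivalence on targets where unsigned int is 32 bits:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t legacy_val = 4294967295U;	/* what old userspace sends */

	/* ~0U is the same all-ones bit pattern, without the magic constant */
	assert(legacy_val == ~0U);
	assert(legacy_val == (uint32_t)-1);

	printf("0x%08x matches ~0U\n", legacy_val);
	return 0;
}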
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3c8a601324a2..9845cb32b60a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1399,8 +1399,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	}
 
 	i = 0;
-	list_for_each_entry(n, &context->names_list, list)
+	list_for_each_entry(n, &context->names_list, list) {
+		if (n->hidden)
+			continue;
 		audit_log_name(context, n, NULL, i++, &call_panic);
+	}
 
 	/* Send end of event record to help user space know we are finished */
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1769,14 +1772,15 @@ void audit_putname(struct filename *name)
  * __audit_inode - store the inode and device from a lookup
  * @name: name being audited
  * @dentry: dentry being audited
- * @parent: does this dentry represent the parent?
+ * @flags: attributes for this particular entry
  */
 void __audit_inode(struct filename *name, const struct dentry *dentry,
-		   unsigned int parent)
+		   unsigned int flags)
 {
 	struct audit_context *context = current->audit_context;
 	const struct inode *inode = dentry->d_inode;
 	struct audit_names *n;
+	bool parent = flags & AUDIT_INODE_PARENT;
 
 	if (!context->in_syscall)
 		return;
@@ -1831,6 +1835,8 @@ out:
 	if (parent) {
 		n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
 		n->type = AUDIT_TYPE_PARENT;
+		if (flags & AUDIT_INODE_HIDDEN)
+			n->hidden = true;
 	} else {
 		n->name_len = AUDIT_NAME_FULL;
 		n->type = AUDIT_TYPE_NORMAL;
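
The auditsc.c hunks above replace __audit_inode()'s boolean parent argument with a flags word carrying AUDIT_INODE_PARENT and AUDIT_INODE_HIDDEN. A stand-alone sketch (userspace C; the flag values here are illustrative, the real definitions live in include/linux/audit.h) of the calling convention this establishes:

#include <stdbool.h>
#include <stdio.h>

#define AUDIT_INODE_PARENT  (1U << 0)	/* record describes the parent directory */
#define AUDIT_INODE_HIDDEN  (1U << 1)	/* parent record that should not be logged */

static void audit_inode_sketch(const char *name, unsigned int flags)
{
	/* same extraction the patch adds at the top of __audit_inode() */
	bool parent = flags & AUDIT_INODE_PARENT;
	bool hidden = parent && (flags & AUDIT_INODE_HIDDEN);

	printf("%-14s parent=%d hidden=%d\n", name, parent, hidden);
}

int main(void)
{
	audit_inode_sketch("normal", 0);
	audit_inode_sketch("parent", AUDIT_INODE_PARENT);
	audit_inode_sketch("hidden parent", AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN);
	return 0;
}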
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7c9e6ddb979..781845a013ab 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,9 +63,6 @@
63 63
64#include <linux/atomic.h> 64#include <linux/atomic.h>
65 65
66/* css deactivation bias, makes css->refcnt negative to deny new trygets */
67#define CSS_DEACT_BIAS INT_MIN
68
69/* 66/*
70 * cgroup_mutex is the master lock. Any modification to cgroup or its 67 * cgroup_mutex is the master lock. Any modification to cgroup or its
71 * hierarchy must be performed while holding it. 68 * hierarchy must be performed while holding it.
@@ -99,16 +96,19 @@ static DEFINE_MUTEX(cgroup_root_mutex);
99 */ 96 */
100#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, 97#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
101#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) 98#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
102static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { 99static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
103#include <linux/cgroup_subsys.h> 100#include <linux/cgroup_subsys.h>
104}; 101};
105 102
106/* 103/*
107 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 104 * The dummy hierarchy, reserved for the subsystems that are otherwise
108 * subsystems that are otherwise unattached - it never has more than a 105 * unattached - it never has more than a single cgroup, and all tasks are
109 * single cgroup, and all tasks are part of that cgroup. 106 * part of that cgroup.
110 */ 107 */
111static struct cgroupfs_root rootnode; 108static struct cgroupfs_root cgroup_dummy_root;
109
110/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
111static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
112 112
113/* 113/*
114 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. 114 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
@@ -186,18 +186,28 @@ struct cgroup_event {
186 186
187/* The list of hierarchy roots */ 187/* The list of hierarchy roots */
188 188
189static LIST_HEAD(roots); 189static LIST_HEAD(cgroup_roots);
190static int root_count; 190static int cgroup_root_count;
191 191
192static DEFINE_IDA(hierarchy_ida); 192/*
193static int next_hierarchy_id; 193 * Hierarchy ID allocation and mapping. It follows the same exclusion
194static DEFINE_SPINLOCK(hierarchy_id_lock); 194 * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
195 195 * writes, either for reads.
196/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 196 */
197#define dummytop (&rootnode.top_cgroup) 197static DEFINE_IDR(cgroup_hierarchy_idr);
198 198
199static struct cgroup_name root_cgroup_name = { .name = "/" }; 199static struct cgroup_name root_cgroup_name = { .name = "/" };
200 200
201/*
202 * Assign a monotonically increasing serial number to cgroups. It
203 * guarantees cgroups with bigger numbers are newer than those with smaller
204 * numbers. Also, as cgroups are always appended to the parent's
205 * ->children list, it guarantees that sibling cgroups are always sorted in
206 * the ascending serial number order on the list. Protected by
207 * cgroup_mutex.
208 */
209static u64 cgroup_serial_nr_next = 1;
210
201/* This flag indicates whether tasks in the fork and exit paths should 211/* This flag indicates whether tasks in the fork and exit paths should
202 * check for fork/exit handlers to call. This avoids us having to do 212 * check for fork/exit handlers to call. This avoids us having to do
203 * extra work in the fork/exit path if none of the subsystems need to 213 * extra work in the fork/exit path if none of the subsystems need to
@@ -205,27 +215,15 @@ static struct cgroup_name root_cgroup_name = { .name = "/" };
205 */ 215 */
206static int need_forkexit_callback __read_mostly; 216static int need_forkexit_callback __read_mostly;
207 217
218static void cgroup_offline_fn(struct work_struct *work);
208static int cgroup_destroy_locked(struct cgroup *cgrp); 219static int cgroup_destroy_locked(struct cgroup *cgrp);
209static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 220static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
210 struct cftype cfts[], bool is_add); 221 struct cftype cfts[], bool is_add);
211 222
212static int css_unbias_refcnt(int refcnt)
213{
214 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
215}
216
217/* the current nr of refs, always >= 0 whether @css is deactivated or not */
218static int css_refcnt(struct cgroup_subsys_state *css)
219{
220 int v = atomic_read(&css->refcnt);
221
222 return css_unbias_refcnt(v);
223}
224
225/* convenient tests for these bits */ 223/* convenient tests for these bits */
226inline int cgroup_is_removed(const struct cgroup *cgrp) 224static inline bool cgroup_is_dead(const struct cgroup *cgrp)
227{ 225{
228 return test_bit(CGRP_REMOVED, &cgrp->flags); 226 return test_bit(CGRP_DEAD, &cgrp->flags);
229} 227}
230 228
231/** 229/**
@@ -261,16 +259,38 @@ static int notify_on_release(const struct cgroup *cgrp)
261 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 259 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
262} 260}
263 261
264/* 262/**
265 * for_each_subsys() allows you to iterate on each subsystem attached to 263 * for_each_subsys - iterate all loaded cgroup subsystems
266 * an active hierarchy 264 * @ss: the iteration cursor
265 * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
266 *
267 * Should be called under cgroup_mutex.
267 */ 268 */
268#define for_each_subsys(_root, _ss) \ 269#define for_each_subsys(ss, i) \
269list_for_each_entry(_ss, &_root->subsys_list, sibling) 270 for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \
271 if (({ lockdep_assert_held(&cgroup_mutex); \
272 !((ss) = cgroup_subsys[i]); })) { } \
273 else
270 274
271/* for_each_active_root() allows you to iterate across the active hierarchies */ 275/**
272#define for_each_active_root(_root) \ 276 * for_each_builtin_subsys - iterate all built-in cgroup subsystems
273list_for_each_entry(_root, &roots, root_list) 277 * @ss: the iteration cursor
278 * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
279 *
280 * Bulit-in subsystems are always present and iteration itself doesn't
281 * require any synchronization.
282 */
283#define for_each_builtin_subsys(ss, i) \
284 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
285 (((ss) = cgroup_subsys[i]) || true); (i)++)
286
287/* iterate each subsystem attached to a hierarchy */
288#define for_each_root_subsys(root, ss) \
289 list_for_each_entry((ss), &(root)->subsys_list, sibling)
290
291/* iterate across the active hierarchies */
292#define for_each_active_root(root) \
293 list_for_each_entry((root), &cgroup_roots, root_list)
274 294
275static inline struct cgroup *__d_cgrp(struct dentry *dentry) 295static inline struct cgroup *__d_cgrp(struct dentry *dentry)
276{ 296{
@@ -297,7 +317,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
297static bool cgroup_lock_live_group(struct cgroup *cgrp) 317static bool cgroup_lock_live_group(struct cgroup *cgrp)
298{ 318{
299 mutex_lock(&cgroup_mutex); 319 mutex_lock(&cgroup_mutex);
300 if (cgroup_is_removed(cgrp)) { 320 if (cgroup_is_dead(cgrp)) {
301 mutex_unlock(&cgroup_mutex); 321 mutex_unlock(&cgroup_mutex);
302 return false; 322 return false;
303 } 323 }
@@ -312,20 +332,24 @@ static void cgroup_release_agent(struct work_struct *work);
312static DECLARE_WORK(release_agent_work, cgroup_release_agent); 332static DECLARE_WORK(release_agent_work, cgroup_release_agent);
313static void check_for_release(struct cgroup *cgrp); 333static void check_for_release(struct cgroup *cgrp);
314 334
315/* Link structure for associating css_set objects with cgroups */ 335/*
316struct cg_cgroup_link { 336 * A cgroup can be associated with multiple css_sets as different tasks may
317 /* 337 * belong to different cgroups on different hierarchies. In the other
318 * List running through cg_cgroup_links associated with a 338 * direction, a css_set is naturally associated with multiple cgroups.
319 * cgroup, anchored on cgroup->css_sets 339 * This M:N relationship is represented by the following link structure
320 */ 340 * which exists for each association and allows traversing the associations
321 struct list_head cgrp_link_list; 341 * from both sides.
322 struct cgroup *cgrp; 342 */
323 /* 343struct cgrp_cset_link {
324 * List running through cg_cgroup_links pointing at a 344 /* the cgroup and css_set this link associates */
325 * single css_set object, anchored on css_set->cg_links 345 struct cgroup *cgrp;
326 */ 346 struct css_set *cset;
327 struct list_head cg_link_list; 347
328 struct css_set *cg; 348 /* list of cgrp_cset_links anchored at cgrp->cset_links */
349 struct list_head cset_link;
350
351 /* list of cgrp_cset_links anchored at css_set->cgrp_links */
352 struct list_head cgrp_link;
329}; 353};
330 354
331/* The default css_set - used by init and its children prior to any 355/* The default css_set - used by init and its children prior to any
@@ -336,7 +360,7 @@ struct cg_cgroup_link {
336 */ 360 */
337 361
338static struct css_set init_css_set; 362static struct css_set init_css_set;
339static struct cg_cgroup_link init_css_set_link; 363static struct cgrp_cset_link init_cgrp_cset_link;
340 364
341static int cgroup_init_idr(struct cgroup_subsys *ss, 365static int cgroup_init_idr(struct cgroup_subsys *ss,
342 struct cgroup_subsys_state *css); 366 struct cgroup_subsys_state *css);
@@ -357,10 +381,11 @@ static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
357 381
358static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) 382static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
359{ 383{
360 int i;
361 unsigned long key = 0UL; 384 unsigned long key = 0UL;
385 struct cgroup_subsys *ss;
386 int i;
362 387
363 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) 388 for_each_subsys(ss, i)
364 key += (unsigned long)css[i]; 389 key += (unsigned long)css[i];
365 key = (key >> 16) ^ key; 390 key = (key >> 16) ^ key;
366 391
@@ -373,90 +398,83 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
373 * compiled into their kernel but not actually in use */ 398 * compiled into their kernel but not actually in use */
374static int use_task_css_set_links __read_mostly; 399static int use_task_css_set_links __read_mostly;
375 400
376static void __put_css_set(struct css_set *cg, int taskexit) 401static void __put_css_set(struct css_set *cset, int taskexit)
377{ 402{
378 struct cg_cgroup_link *link; 403 struct cgrp_cset_link *link, *tmp_link;
379 struct cg_cgroup_link *saved_link; 404
380 /* 405 /*
381 * Ensure that the refcount doesn't hit zero while any readers 406 * Ensure that the refcount doesn't hit zero while any readers
382 * can see it. Similar to atomic_dec_and_lock(), but for an 407 * can see it. Similar to atomic_dec_and_lock(), but for an
383 * rwlock 408 * rwlock
384 */ 409 */
385 if (atomic_add_unless(&cg->refcount, -1, 1)) 410 if (atomic_add_unless(&cset->refcount, -1, 1))
386 return; 411 return;
387 write_lock(&css_set_lock); 412 write_lock(&css_set_lock);
388 if (!atomic_dec_and_test(&cg->refcount)) { 413 if (!atomic_dec_and_test(&cset->refcount)) {
389 write_unlock(&css_set_lock); 414 write_unlock(&css_set_lock);
390 return; 415 return;
391 } 416 }
392 417
393 /* This css_set is dead. unlink it and release cgroup refcounts */ 418 /* This css_set is dead. unlink it and release cgroup refcounts */
394 hash_del(&cg->hlist); 419 hash_del(&cset->hlist);
395 css_set_count--; 420 css_set_count--;
396 421
397 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 422 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
398 cg_link_list) {
399 struct cgroup *cgrp = link->cgrp; 423 struct cgroup *cgrp = link->cgrp;
400 list_del(&link->cg_link_list);
401 list_del(&link->cgrp_link_list);
402 424
403 /* 425 list_del(&link->cset_link);
404 * We may not be holding cgroup_mutex, and if cgrp->count is 426 list_del(&link->cgrp_link);
405 * dropped to 0 the cgroup can be destroyed at any time, hence 427
406 * rcu_read_lock is used to keep it alive. 428 /* @cgrp can't go away while we're holding css_set_lock */
407 */ 429 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
408 rcu_read_lock();
409 if (atomic_dec_and_test(&cgrp->count) &&
410 notify_on_release(cgrp)) {
411 if (taskexit) 430 if (taskexit)
412 set_bit(CGRP_RELEASABLE, &cgrp->flags); 431 set_bit(CGRP_RELEASABLE, &cgrp->flags);
413 check_for_release(cgrp); 432 check_for_release(cgrp);
414 } 433 }
415 rcu_read_unlock();
416 434
417 kfree(link); 435 kfree(link);
418 } 436 }
419 437
420 write_unlock(&css_set_lock); 438 write_unlock(&css_set_lock);
421 kfree_rcu(cg, rcu_head); 439 kfree_rcu(cset, rcu_head);
422} 440}
423 441
424/* 442/*
425 * refcounted get/put for css_set objects 443 * refcounted get/put for css_set objects
426 */ 444 */
427static inline void get_css_set(struct css_set *cg) 445static inline void get_css_set(struct css_set *cset)
428{ 446{
429 atomic_inc(&cg->refcount); 447 atomic_inc(&cset->refcount);
430} 448}
431 449
432static inline void put_css_set(struct css_set *cg) 450static inline void put_css_set(struct css_set *cset)
433{ 451{
434 __put_css_set(cg, 0); 452 __put_css_set(cset, 0);
435} 453}
436 454
437static inline void put_css_set_taskexit(struct css_set *cg) 455static inline void put_css_set_taskexit(struct css_set *cset)
438{ 456{
439 __put_css_set(cg, 1); 457 __put_css_set(cset, 1);
440} 458}
441 459
442/* 460/**
443 * compare_css_sets - helper function for find_existing_css_set(). 461 * compare_css_sets - helper function for find_existing_css_set().
444 * @cg: candidate css_set being tested 462 * @cset: candidate css_set being tested
445 * @old_cg: existing css_set for a task 463 * @old_cset: existing css_set for a task
446 * @new_cgrp: cgroup that's being entered by the task 464 * @new_cgrp: cgroup that's being entered by the task
447 * @template: desired set of css pointers in css_set (pre-calculated) 465 * @template: desired set of css pointers in css_set (pre-calculated)
448 * 466 *
449 * Returns true if "cg" matches "old_cg" except for the hierarchy 467 * Returns true if "cg" matches "old_cg" except for the hierarchy
450 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 468 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
451 */ 469 */
452static bool compare_css_sets(struct css_set *cg, 470static bool compare_css_sets(struct css_set *cset,
453 struct css_set *old_cg, 471 struct css_set *old_cset,
454 struct cgroup *new_cgrp, 472 struct cgroup *new_cgrp,
455 struct cgroup_subsys_state *template[]) 473 struct cgroup_subsys_state *template[])
456{ 474{
457 struct list_head *l1, *l2; 475 struct list_head *l1, *l2;
458 476
459 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { 477 if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
460 /* Not all subsystems matched */ 478 /* Not all subsystems matched */
461 return false; 479 return false;
462 } 480 }
@@ -470,28 +488,28 @@ static bool compare_css_sets(struct css_set *cg,
470 * candidates. 488 * candidates.
471 */ 489 */
472 490
473 l1 = &cg->cg_links; 491 l1 = &cset->cgrp_links;
474 l2 = &old_cg->cg_links; 492 l2 = &old_cset->cgrp_links;
475 while (1) { 493 while (1) {
476 struct cg_cgroup_link *cgl1, *cgl2; 494 struct cgrp_cset_link *link1, *link2;
477 struct cgroup *cg1, *cg2; 495 struct cgroup *cgrp1, *cgrp2;
478 496
479 l1 = l1->next; 497 l1 = l1->next;
480 l2 = l2->next; 498 l2 = l2->next;
481 /* See if we reached the end - both lists are equal length. */ 499 /* See if we reached the end - both lists are equal length. */
482 if (l1 == &cg->cg_links) { 500 if (l1 == &cset->cgrp_links) {
483 BUG_ON(l2 != &old_cg->cg_links); 501 BUG_ON(l2 != &old_cset->cgrp_links);
484 break; 502 break;
485 } else { 503 } else {
486 BUG_ON(l2 == &old_cg->cg_links); 504 BUG_ON(l2 == &old_cset->cgrp_links);
487 } 505 }
488 /* Locate the cgroups associated with these links. */ 506 /* Locate the cgroups associated with these links. */
489 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); 507 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
490 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); 508 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
491 cg1 = cgl1->cgrp; 509 cgrp1 = link1->cgrp;
492 cg2 = cgl2->cgrp; 510 cgrp2 = link2->cgrp;
493 /* Hierarchies should be linked in the same order. */ 511 /* Hierarchies should be linked in the same order. */
494 BUG_ON(cg1->root != cg2->root); 512 BUG_ON(cgrp1->root != cgrp2->root);
495 513
496 /* 514 /*
497 * If this hierarchy is the hierarchy of the cgroup 515 * If this hierarchy is the hierarchy of the cgroup
@@ -500,46 +518,39 @@ static bool compare_css_sets(struct css_set *cg,
500 * hierarchy, then this css_set should point to the 518 * hierarchy, then this css_set should point to the
501 * same cgroup as the old css_set. 519 * same cgroup as the old css_set.
502 */ 520 */
503 if (cg1->root == new_cgrp->root) { 521 if (cgrp1->root == new_cgrp->root) {
504 if (cg1 != new_cgrp) 522 if (cgrp1 != new_cgrp)
505 return false; 523 return false;
506 } else { 524 } else {
507 if (cg1 != cg2) 525 if (cgrp1 != cgrp2)
508 return false; 526 return false;
509 } 527 }
510 } 528 }
511 return true; 529 return true;
512} 530}
513 531
514/* 532/**
515 * find_existing_css_set() is a helper for 533 * find_existing_css_set - init css array and find the matching css_set
516 * find_css_set(), and checks to see whether an existing 534 * @old_cset: the css_set that we're using before the cgroup transition
517 * css_set is suitable. 535 * @cgrp: the cgroup that we're moving into
518 * 536 * @template: out param for the new set of csses, should be clear on entry
519 * oldcg: the cgroup group that we're using before the cgroup
520 * transition
521 *
522 * cgrp: the cgroup that we're moving into
523 *
524 * template: location in which to build the desired set of subsystem
525 * state objects for the new cgroup group
526 */ 537 */
527static struct css_set *find_existing_css_set( 538static struct css_set *find_existing_css_set(struct css_set *old_cset,
528 struct css_set *oldcg, 539 struct cgroup *cgrp,
529 struct cgroup *cgrp, 540 struct cgroup_subsys_state *template[])
530 struct cgroup_subsys_state *template[])
531{ 541{
532 int i;
533 struct cgroupfs_root *root = cgrp->root; 542 struct cgroupfs_root *root = cgrp->root;
534 struct css_set *cg; 543 struct cgroup_subsys *ss;
544 struct css_set *cset;
535 unsigned long key; 545 unsigned long key;
546 int i;
536 547
537 /* 548 /*
538 * Build the set of subsystem state objects that we want to see in the 549 * Build the set of subsystem state objects that we want to see in the
539 * new css_set. while subsystems can change globally, the entries here 550 * new css_set. while subsystems can change globally, the entries here
540 * won't change, so no need for locking. 551 * won't change, so no need for locking.
541 */ 552 */
542 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 553 for_each_subsys(ss, i) {
543 if (root->subsys_mask & (1UL << i)) { 554 if (root->subsys_mask & (1UL << i)) {
544 /* Subsystem is in this hierarchy. So we want 555 /* Subsystem is in this hierarchy. So we want
545 * the subsystem state from the new 556 * the subsystem state from the new
@@ -548,148 +559,152 @@ static struct css_set *find_existing_css_set(
548 } else { 559 } else {
549 /* Subsystem is not in this hierarchy, so we 560 /* Subsystem is not in this hierarchy, so we
550 * don't want to change the subsystem state */ 561 * don't want to change the subsystem state */
551 template[i] = oldcg->subsys[i]; 562 template[i] = old_cset->subsys[i];
552 } 563 }
553 } 564 }
554 565
555 key = css_set_hash(template); 566 key = css_set_hash(template);
556 hash_for_each_possible(css_set_table, cg, hlist, key) { 567 hash_for_each_possible(css_set_table, cset, hlist, key) {
557 if (!compare_css_sets(cg, oldcg, cgrp, template)) 568 if (!compare_css_sets(cset, old_cset, cgrp, template))
558 continue; 569 continue;
559 570
560 /* This css_set matches what we need */ 571 /* This css_set matches what we need */
561 return cg; 572 return cset;
562 } 573 }
563 574
564 /* No existing cgroup group matched */ 575 /* No existing cgroup group matched */
565 return NULL; 576 return NULL;
566} 577}
567 578
568static void free_cg_links(struct list_head *tmp) 579static void free_cgrp_cset_links(struct list_head *links_to_free)
569{ 580{
570 struct cg_cgroup_link *link; 581 struct cgrp_cset_link *link, *tmp_link;
571 struct cg_cgroup_link *saved_link;
572 582
573 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { 583 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
574 list_del(&link->cgrp_link_list); 584 list_del(&link->cset_link);
575 kfree(link); 585 kfree(link);
576 } 586 }
577} 587}
578 588
579/* 589/**
580 * allocate_cg_links() allocates "count" cg_cgroup_link structures 590 * allocate_cgrp_cset_links - allocate cgrp_cset_links
581 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on 591 * @count: the number of links to allocate
582 * success or a negative error 592 * @tmp_links: list_head the allocated links are put on
593 *
594 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
595 * through ->cset_link. Returns 0 on success or -errno.
583 */ 596 */
584static int allocate_cg_links(int count, struct list_head *tmp) 597static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
585{ 598{
586 struct cg_cgroup_link *link; 599 struct cgrp_cset_link *link;
587 int i; 600 int i;
588 INIT_LIST_HEAD(tmp); 601
602 INIT_LIST_HEAD(tmp_links);
603
589 for (i = 0; i < count; i++) { 604 for (i = 0; i < count; i++) {
590 link = kmalloc(sizeof(*link), GFP_KERNEL); 605 link = kzalloc(sizeof(*link), GFP_KERNEL);
591 if (!link) { 606 if (!link) {
592 free_cg_links(tmp); 607 free_cgrp_cset_links(tmp_links);
593 return -ENOMEM; 608 return -ENOMEM;
594 } 609 }
595 list_add(&link->cgrp_link_list, tmp); 610 list_add(&link->cset_link, tmp_links);
596 } 611 }
597 return 0; 612 return 0;
598} 613}
599 614
600/** 615/**
601 * link_css_set - a helper function to link a css_set to a cgroup 616 * link_css_set - a helper function to link a css_set to a cgroup
602 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() 617 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
603 * @cg: the css_set to be linked 618 * @cset: the css_set to be linked
604 * @cgrp: the destination cgroup 619 * @cgrp: the destination cgroup
605 */ 620 */
606static void link_css_set(struct list_head *tmp_cg_links, 621static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
607 struct css_set *cg, struct cgroup *cgrp) 622 struct cgroup *cgrp)
608{ 623{
609 struct cg_cgroup_link *link; 624 struct cgrp_cset_link *link;
610 625
611 BUG_ON(list_empty(tmp_cg_links)); 626 BUG_ON(list_empty(tmp_links));
612 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, 627 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
613 cgrp_link_list); 628 link->cset = cset;
614 link->cg = cg;
615 link->cgrp = cgrp; 629 link->cgrp = cgrp;
616 atomic_inc(&cgrp->count); 630 list_move(&link->cset_link, &cgrp->cset_links);
617 list_move(&link->cgrp_link_list, &cgrp->css_sets);
618 /* 631 /*
619 * Always add links to the tail of the list so that the list 632 * Always add links to the tail of the list so that the list
620 * is sorted by order of hierarchy creation 633 * is sorted by order of hierarchy creation
621 */ 634 */
622 list_add_tail(&link->cg_link_list, &cg->cg_links); 635 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
623} 636}
624 637
625/* 638/**
626 * find_css_set() takes an existing cgroup group and a 639 * find_css_set - return a new css_set with one cgroup updated
627 * cgroup object, and returns a css_set object that's 640 * @old_cset: the baseline css_set
628 * equivalent to the old group, but with the given cgroup 641 * @cgrp: the cgroup to be updated
629 * substituted into the appropriate hierarchy. Must be called with 642 *
630 * cgroup_mutex held 643 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
644 * substituted into the appropriate hierarchy.
631 */ 645 */
632static struct css_set *find_css_set( 646static struct css_set *find_css_set(struct css_set *old_cset,
633 struct css_set *oldcg, struct cgroup *cgrp) 647 struct cgroup *cgrp)
634{ 648{
635 struct css_set *res; 649 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
636 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 650 struct css_set *cset;
637 651 struct list_head tmp_links;
638 struct list_head tmp_cg_links; 652 struct cgrp_cset_link *link;
639
640 struct cg_cgroup_link *link;
641 unsigned long key; 653 unsigned long key;
642 654
655 lockdep_assert_held(&cgroup_mutex);
656
643 /* First see if we already have a cgroup group that matches 657 /* First see if we already have a cgroup group that matches
644 * the desired set */ 658 * the desired set */
645 read_lock(&css_set_lock); 659 read_lock(&css_set_lock);
646 res = find_existing_css_set(oldcg, cgrp, template); 660 cset = find_existing_css_set(old_cset, cgrp, template);
647 if (res) 661 if (cset)
648 get_css_set(res); 662 get_css_set(cset);
649 read_unlock(&css_set_lock); 663 read_unlock(&css_set_lock);
650 664
651 if (res) 665 if (cset)
652 return res; 666 return cset;
653 667
654 res = kmalloc(sizeof(*res), GFP_KERNEL); 668 cset = kzalloc(sizeof(*cset), GFP_KERNEL);
655 if (!res) 669 if (!cset)
656 return NULL; 670 return NULL;
657 671
658 /* Allocate all the cg_cgroup_link objects that we'll need */ 672 /* Allocate all the cgrp_cset_link objects that we'll need */
659 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { 673 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
660 kfree(res); 674 kfree(cset);
661 return NULL; 675 return NULL;
662 } 676 }
663 677
664 atomic_set(&res->refcount, 1); 678 atomic_set(&cset->refcount, 1);
665 INIT_LIST_HEAD(&res->cg_links); 679 INIT_LIST_HEAD(&cset->cgrp_links);
666 INIT_LIST_HEAD(&res->tasks); 680 INIT_LIST_HEAD(&cset->tasks);
667 INIT_HLIST_NODE(&res->hlist); 681 INIT_HLIST_NODE(&cset->hlist);
668 682
669 /* Copy the set of subsystem state objects generated in 683 /* Copy the set of subsystem state objects generated in
670 * find_existing_css_set() */ 684 * find_existing_css_set() */
671 memcpy(res->subsys, template, sizeof(res->subsys)); 685 memcpy(cset->subsys, template, sizeof(cset->subsys));
672 686
673 write_lock(&css_set_lock); 687 write_lock(&css_set_lock);
674 /* Add reference counts and links from the new css_set. */ 688 /* Add reference counts and links from the new css_set. */
675 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { 689 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
676 struct cgroup *c = link->cgrp; 690 struct cgroup *c = link->cgrp;
691
677 if (c->root == cgrp->root) 692 if (c->root == cgrp->root)
678 c = cgrp; 693 c = cgrp;
679 link_css_set(&tmp_cg_links, res, c); 694 link_css_set(&tmp_links, cset, c);
680 } 695 }
681 696
682 BUG_ON(!list_empty(&tmp_cg_links)); 697 BUG_ON(!list_empty(&tmp_links));
683 698
684 css_set_count++; 699 css_set_count++;
685 700
686 /* Add this cgroup group to the hash table */ 701 /* Add this cgroup group to the hash table */
687 key = css_set_hash(res->subsys); 702 key = css_set_hash(cset->subsys);
688 hash_add(css_set_table, &res->hlist, key); 703 hash_add(css_set_table, &cset->hlist, key);
689 704
690 write_unlock(&css_set_lock); 705 write_unlock(&css_set_lock);
691 706
692 return res; 707 return cset;
693} 708}
694 709
695/* 710/*
@@ -699,7 +714,7 @@ static struct css_set *find_css_set(
699static struct cgroup *task_cgroup_from_root(struct task_struct *task, 714static struct cgroup *task_cgroup_from_root(struct task_struct *task,
700 struct cgroupfs_root *root) 715 struct cgroupfs_root *root)
701{ 716{
702 struct css_set *css; 717 struct css_set *cset;
703 struct cgroup *res = NULL; 718 struct cgroup *res = NULL;
704 719
705 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 720 BUG_ON(!mutex_is_locked(&cgroup_mutex));
@@ -709,13 +724,15 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
709 * task can't change groups, so the only thing that can happen 724 * task can't change groups, so the only thing that can happen
710 * is that it exits and its css is set back to init_css_set. 725 * is that it exits and its css is set back to init_css_set.
711 */ 726 */
712 css = task->cgroups; 727 cset = task_css_set(task);
713 if (css == &init_css_set) { 728 if (cset == &init_css_set) {
714 res = &root->top_cgroup; 729 res = &root->top_cgroup;
715 } else { 730 } else {
716 struct cg_cgroup_link *link; 731 struct cgrp_cset_link *link;
717 list_for_each_entry(link, &css->cg_links, cg_link_list) { 732
733 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
718 struct cgroup *c = link->cgrp; 734 struct cgroup *c = link->cgrp;
735
719 if (c->root == root) { 736 if (c->root == root) {
720 res = c; 737 res = c;
721 break; 738 break;
@@ -785,7 +802,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
785 */ 802 */
786 803
787static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 804static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
788static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
789static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 805static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
790static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 806static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
791 unsigned long subsys_mask); 807 unsigned long subsys_mask);
@@ -828,14 +844,14 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
828 844
829static void cgroup_free_fn(struct work_struct *work) 845static void cgroup_free_fn(struct work_struct *work)
830{ 846{
831 struct cgroup *cgrp = container_of(work, struct cgroup, free_work); 847 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
832 struct cgroup_subsys *ss; 848 struct cgroup_subsys *ss;
833 849
834 mutex_lock(&cgroup_mutex); 850 mutex_lock(&cgroup_mutex);
835 /* 851 /*
836 * Release the subsystem state objects. 852 * Release the subsystem state objects.
837 */ 853 */
838 for_each_subsys(cgrp->root, ss) 854 for_each_root_subsys(cgrp->root, ss)
839 ss->css_free(cgrp); 855 ss->css_free(cgrp);
840 856
841 cgrp->root->number_of_cgroups--; 857 cgrp->root->number_of_cgroups--;
@@ -873,7 +889,8 @@ static void cgroup_free_rcu(struct rcu_head *head)
873{ 889{
874 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 890 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
875 891
876 schedule_work(&cgrp->free_work); 892 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
893 schedule_work(&cgrp->destroy_work);
877} 894}
878 895
879static void cgroup_diput(struct dentry *dentry, struct inode *inode) 896static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -882,7 +899,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
882 if (S_ISDIR(inode->i_mode)) { 899 if (S_ISDIR(inode->i_mode)) {
883 struct cgroup *cgrp = dentry->d_fsdata; 900 struct cgroup *cgrp = dentry->d_fsdata;
884 901
885 BUG_ON(!(cgroup_is_removed(cgrp))); 902 BUG_ON(!(cgroup_is_dead(cgrp)));
886 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 903 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
887 } else { 904 } else {
888 struct cfent *cfe = __d_cfe(dentry); 905 struct cfent *cfe = __d_cfe(dentry);
@@ -950,7 +967,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
950 struct cgroup *cgrp = __d_cgrp(dir); 967 struct cgroup *cgrp = __d_cgrp(dir);
951 struct cgroup_subsys *ss; 968 struct cgroup_subsys *ss;
952 969
953 for_each_subsys(cgrp->root, ss) { 970 for_each_root_subsys(cgrp->root, ss) {
954 struct cftype_set *set; 971 struct cftype_set *set;
955 if (!test_bit(ss->subsys_id, &subsys_mask)) 972 if (!test_bit(ss->subsys_id, &subsys_mask))
956 continue; 973 continue;
@@ -988,30 +1005,23 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
988 * returns an error, no reference counts are touched. 1005 * returns an error, no reference counts are touched.
989 */ 1006 */
990static int rebind_subsystems(struct cgroupfs_root *root, 1007static int rebind_subsystems(struct cgroupfs_root *root,
991 unsigned long final_subsys_mask) 1008 unsigned long added_mask, unsigned removed_mask)
992{ 1009{
993 unsigned long added_mask, removed_mask;
994 struct cgroup *cgrp = &root->top_cgroup; 1010 struct cgroup *cgrp = &root->top_cgroup;
1011 struct cgroup_subsys *ss;
995 int i; 1012 int i;
996 1013
997 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1014 BUG_ON(!mutex_is_locked(&cgroup_mutex));
998 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 1015 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
999 1016
1000 removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
1001 added_mask = final_subsys_mask & ~root->actual_subsys_mask;
1002 /* Check that any added subsystems are currently free */ 1017 /* Check that any added subsystems are currently free */
1003 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1018 for_each_subsys(ss, i) {
1004 unsigned long bit = 1UL << i; 1019 unsigned long bit = 1UL << i;
1005 struct cgroup_subsys *ss = subsys[i]; 1020
1006 if (!(bit & added_mask)) 1021 if (!(bit & added_mask))
1007 continue; 1022 continue;
1008 /* 1023
1009 * Nobody should tell us to do a subsys that doesn't exist: 1024 if (ss->root != &cgroup_dummy_root) {
1010 * parse_cgroupfs_options should catch that case and refcounts
1011 * ensure that subsystems won't disappear once selected.
1012 */
1013 BUG_ON(ss == NULL);
1014 if (ss->root != &rootnode) {
1015 /* Subsystem isn't free */ 1025 /* Subsystem isn't free */
1016 return -EBUSY; 1026 return -EBUSY;
1017 } 1027 }
@@ -1025,38 +1035,41 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1025 return -EBUSY; 1035 return -EBUSY;
1026 1036
1027 /* Process each subsystem */ 1037 /* Process each subsystem */
1028 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1038 for_each_subsys(ss, i) {
1029 struct cgroup_subsys *ss = subsys[i];
1030 unsigned long bit = 1UL << i; 1039 unsigned long bit = 1UL << i;
1040
1031 if (bit & added_mask) { 1041 if (bit & added_mask) {
1032 /* We're binding this subsystem to this hierarchy */ 1042 /* We're binding this subsystem to this hierarchy */
1033 BUG_ON(ss == NULL);
1034 BUG_ON(cgrp->subsys[i]); 1043 BUG_ON(cgrp->subsys[i]);
1035 BUG_ON(!dummytop->subsys[i]); 1044 BUG_ON(!cgroup_dummy_top->subsys[i]);
1036 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 1045 BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);
1037 cgrp->subsys[i] = dummytop->subsys[i]; 1046
1047 cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
1038 cgrp->subsys[i]->cgroup = cgrp; 1048 cgrp->subsys[i]->cgroup = cgrp;
1039 list_move(&ss->sibling, &root->subsys_list); 1049 list_move(&ss->sibling, &root->subsys_list);
1040 ss->root = root; 1050 ss->root = root;
1041 if (ss->bind) 1051 if (ss->bind)
1042 ss->bind(cgrp); 1052 ss->bind(cgrp);
1053
1043 /* refcount was already taken, and we're keeping it */ 1054 /* refcount was already taken, and we're keeping it */
1055 root->subsys_mask |= bit;
1044 } else if (bit & removed_mask) { 1056 } else if (bit & removed_mask) {
1045 /* We're removing this subsystem */ 1057 /* We're removing this subsystem */
1046 BUG_ON(ss == NULL); 1058 BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
1047 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
1048 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1059 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
1060
1049 if (ss->bind) 1061 if (ss->bind)
1050 ss->bind(dummytop); 1062 ss->bind(cgroup_dummy_top);
1051 dummytop->subsys[i]->cgroup = dummytop; 1063 cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
1052 cgrp->subsys[i] = NULL; 1064 cgrp->subsys[i] = NULL;
1053 subsys[i]->root = &rootnode; 1065 cgroup_subsys[i]->root = &cgroup_dummy_root;
1054 list_move(&ss->sibling, &rootnode.subsys_list); 1066 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1067
1055 /* subsystem is now free - drop reference on module */ 1068 /* subsystem is now free - drop reference on module */
1056 module_put(ss->module); 1069 module_put(ss->module);
1057 } else if (bit & final_subsys_mask) { 1070 root->subsys_mask &= ~bit;
1071 } else if (bit & root->subsys_mask) {
1058 /* Subsystem state should already exist */ 1072 /* Subsystem state should already exist */
1059 BUG_ON(ss == NULL);
1060 BUG_ON(!cgrp->subsys[i]); 1073 BUG_ON(!cgrp->subsys[i]);
1061 /* 1074 /*
1062 * a refcount was taken, but we already had one, so 1075 * a refcount was taken, but we already had one, so
@@ -1071,7 +1084,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1071 BUG_ON(cgrp->subsys[i]); 1084 BUG_ON(cgrp->subsys[i]);
1072 } 1085 }
1073 } 1086 }
1074 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; 1087
1088 /*
1089 * Mark @root has finished binding subsystems. @root->subsys_mask
1090 * now matches the bound subsystems.
1091 */
1092 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1075 1093
1076 return 0; 1094 return 0;
1077} 1095}
@@ -1082,7 +1100,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1082 struct cgroup_subsys *ss; 1100 struct cgroup_subsys *ss;
1083 1101
1084 mutex_lock(&cgroup_root_mutex); 1102 mutex_lock(&cgroup_root_mutex);
1085 for_each_subsys(root, ss) 1103 for_each_root_subsys(root, ss)
1086 seq_printf(seq, ",%s", ss->name); 1104 seq_printf(seq, ",%s", ss->name);
1087 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1105 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1088 seq_puts(seq, ",sane_behavior"); 1106 seq_puts(seq, ",sane_behavior");
@@ -1114,18 +1132,19 @@ struct cgroup_sb_opts {
1114}; 1132};
1115 1133
1116/* 1134/*
1117 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call 1135 * Convert a hierarchy specifier into a bitmask of subsystems and
1118 * with cgroup_mutex held to protect the subsys[] array. This function takes 1136 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1119 * refcounts on subsystems to be used, unless it returns error, in which case 1137 * array. This function takes refcounts on subsystems to be used, unless it
1120 * no refcounts are taken. 1138 * returns error, in which case no refcounts are taken.
1121 */ 1139 */
1122static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1140static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1123{ 1141{
1124 char *token, *o = data; 1142 char *token, *o = data;
1125 bool all_ss = false, one_ss = false; 1143 bool all_ss = false, one_ss = false;
1126 unsigned long mask = (unsigned long)-1; 1144 unsigned long mask = (unsigned long)-1;
1127 int i;
1128 bool module_pin_failed = false; 1145 bool module_pin_failed = false;
1146 struct cgroup_subsys *ss;
1147 int i;
1129 1148
1130 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1149 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1131 1150
@@ -1202,10 +1221,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1202 continue; 1221 continue;
1203 } 1222 }
1204 1223
1205 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1224 for_each_subsys(ss, i) {
1206 struct cgroup_subsys *ss = subsys[i];
1207 if (ss == NULL)
1208 continue;
1209 if (strcmp(token, ss->name)) 1225 if (strcmp(token, ss->name))
1210 continue; 1226 continue;
1211 if (ss->disabled) 1227 if (ss->disabled)
@@ -1228,16 +1244,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1228 * otherwise if 'none', 'name=' and a subsystem name options 1244 * otherwise if 'none', 'name=' and a subsystem name options
1229 * were not specified, let's default to 'all' 1245 * were not specified, let's default to 'all'
1230 */ 1246 */
1231 if (all_ss || (!one_ss && !opts->none && !opts->name)) { 1247 if (all_ss || (!one_ss && !opts->none && !opts->name))
1232 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1248 for_each_subsys(ss, i)
1233 struct cgroup_subsys *ss = subsys[i]; 1249 if (!ss->disabled)
1234 if (ss == NULL) 1250 set_bit(i, &opts->subsys_mask);
1235 continue;
1236 if (ss->disabled)
1237 continue;
1238 set_bit(i, &opts->subsys_mask);
1239 }
1240 }
1241 1251
1242 /* Consistency checks */ 1252 /* Consistency checks */
1243 1253
@@ -1281,12 +1291,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1281 * take duplicate reference counts on a subsystem that's already used, 1291 * take duplicate reference counts on a subsystem that's already used,
1282 * but rebind_subsystems handles this case. 1292 * but rebind_subsystems handles this case.
1283 */ 1293 */
1284 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1294 for_each_subsys(ss, i) {
1285 unsigned long bit = 1UL << i; 1295 if (!(opts->subsys_mask & (1UL << i)))
1286
1287 if (!(bit & opts->subsys_mask))
1288 continue; 1296 continue;
1289 if (!try_module_get(subsys[i]->module)) { 1297 if (!try_module_get(cgroup_subsys[i]->module)) {
1290 module_pin_failed = true; 1298 module_pin_failed = true;
1291 break; 1299 break;
1292 } 1300 }
@@ -1303,7 +1311,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1303 1311
1304 if (!(bit & opts->subsys_mask)) 1312 if (!(bit & opts->subsys_mask))
1305 continue; 1313 continue;
1306 module_put(subsys[i]->module); 1314 module_put(cgroup_subsys[i]->module);
1307 } 1315 }
1308 return -ENOENT; 1316 return -ENOENT;
1309 } 1317 }
@@ -1313,14 +1321,14 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1313 1321
1314static void drop_parsed_module_refcounts(unsigned long subsys_mask) 1322static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1315{ 1323{
1324 struct cgroup_subsys *ss;
1316 int i; 1325 int i;
1317 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1318 unsigned long bit = 1UL << i;
1319 1326
1320 if (!(bit & subsys_mask)) 1327 mutex_lock(&cgroup_mutex);
1321 continue; 1328 for_each_subsys(ss, i)
1322 module_put(subsys[i]->module); 1329 if (subsys_mask & (1UL << i))
1323 } 1330 module_put(cgroup_subsys[i]->module);
1331 mutex_unlock(&cgroup_mutex);
1324} 1332}
1325 1333
1326static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1334static int cgroup_remount(struct super_block *sb, int *flags, char *data)
@@ -1345,7 +1353,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1345 if (ret) 1353 if (ret)
1346 goto out_unlock; 1354 goto out_unlock;
1347 1355
1348 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) 1356 if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1349 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1357 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1350 task_tgid_nr(current), current->comm); 1358 task_tgid_nr(current), current->comm);
1351 1359
@@ -1353,10 +1361,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1353 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1361 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1354 1362
1355 /* Don't allow flags or name to change at remount */ 1363 /* Don't allow flags or name to change at remount */
1356 if (opts.flags != root->flags || 1364 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1357 (opts.name && strcmp(opts.name, root->name))) { 1365 (opts.name && strcmp(opts.name, root->name))) {
1366 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
1367 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1368 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1358 ret = -EINVAL; 1369 ret = -EINVAL;
1359 drop_parsed_module_refcounts(opts.subsys_mask);
1360 goto out_unlock; 1370 goto out_unlock;
1361 } 1371 }
1362 1372
@@ -1367,11 +1377,10 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1367 */ 1377 */
1368 cgroup_clear_directory(cgrp->dentry, false, removed_mask); 1378 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1369 1379
1370 ret = rebind_subsystems(root, opts.subsys_mask); 1380 ret = rebind_subsystems(root, added_mask, removed_mask);
1371 if (ret) { 1381 if (ret) {
1372 /* rebind_subsystems failed, re-populate the removed files */ 1382 /* rebind_subsystems failed, re-populate the removed files */
1373 cgroup_populate_dir(cgrp, false, removed_mask); 1383 cgroup_populate_dir(cgrp, false, removed_mask);
1374 drop_parsed_module_refcounts(opts.subsys_mask);
1375 goto out_unlock; 1384 goto out_unlock;
1376 } 1385 }
1377 1386
@@ -1386,6 +1395,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1386 mutex_unlock(&cgroup_root_mutex); 1395 mutex_unlock(&cgroup_root_mutex);
1387 mutex_unlock(&cgroup_mutex); 1396 mutex_unlock(&cgroup_mutex);
1388 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1397 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1398 if (ret)
1399 drop_parsed_module_refcounts(opts.subsys_mask);
1389 return ret; 1400 return ret;
1390} 1401}
1391 1402
@@ -1401,11 +1412,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1401 INIT_LIST_HEAD(&cgrp->sibling); 1412 INIT_LIST_HEAD(&cgrp->sibling);
1402 INIT_LIST_HEAD(&cgrp->children); 1413 INIT_LIST_HEAD(&cgrp->children);
1403 INIT_LIST_HEAD(&cgrp->files); 1414 INIT_LIST_HEAD(&cgrp->files);
1404 INIT_LIST_HEAD(&cgrp->css_sets); 1415 INIT_LIST_HEAD(&cgrp->cset_links);
1405 INIT_LIST_HEAD(&cgrp->allcg_node);
1406 INIT_LIST_HEAD(&cgrp->release_list); 1416 INIT_LIST_HEAD(&cgrp->release_list);
1407 INIT_LIST_HEAD(&cgrp->pidlists); 1417 INIT_LIST_HEAD(&cgrp->pidlists);
1408 INIT_WORK(&cgrp->free_work, cgroup_free_fn);
1409 mutex_init(&cgrp->pidlist_mutex); 1418 mutex_init(&cgrp->pidlist_mutex);
1410 INIT_LIST_HEAD(&cgrp->event_list); 1419 INIT_LIST_HEAD(&cgrp->event_list);
1411 spin_lock_init(&cgrp->event_list_lock); 1420 spin_lock_init(&cgrp->event_list_lock);
@@ -1418,37 +1427,37 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1418 1427
1419 INIT_LIST_HEAD(&root->subsys_list); 1428 INIT_LIST_HEAD(&root->subsys_list);
1420 INIT_LIST_HEAD(&root->root_list); 1429 INIT_LIST_HEAD(&root->root_list);
1421 INIT_LIST_HEAD(&root->allcg_list);
1422 root->number_of_cgroups = 1; 1430 root->number_of_cgroups = 1;
1423 cgrp->root = root; 1431 cgrp->root = root;
1424 cgrp->name = &root_cgroup_name; 1432 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1425 init_cgroup_housekeeping(cgrp); 1433 init_cgroup_housekeeping(cgrp);
1426 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1427} 1434}
1428 1435
1429static bool init_root_id(struct cgroupfs_root *root) 1436static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
1430{ 1437{
1431 int ret = 0; 1438 int id;
1432 1439
1433 do { 1440 lockdep_assert_held(&cgroup_mutex);
1434 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) 1441 lockdep_assert_held(&cgroup_root_mutex);
1435 return false; 1442
1436 spin_lock(&hierarchy_id_lock); 1443 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
1437 /* Try to allocate the next unused ID */ 1444 GFP_KERNEL);
1438 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, 1445 if (id < 0)
1439 &root->hierarchy_id); 1446 return id;
1440 if (ret == -ENOSPC) 1447
1441 /* Try again starting from 0 */ 1448 root->hierarchy_id = id;
1442 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); 1449 return 0;
1443 if (!ret) { 1450}
1444 next_hierarchy_id = root->hierarchy_id + 1; 1451
1445 } else if (ret != -EAGAIN) { 1452static void cgroup_exit_root_id(struct cgroupfs_root *root)
1446 /* Can only get here if the 31-bit IDR is full ... */ 1453{
1447 BUG_ON(ret); 1454 lockdep_assert_held(&cgroup_mutex);
1448 } 1455 lockdep_assert_held(&cgroup_root_mutex);
1449 spin_unlock(&hierarchy_id_lock); 1456
1450 } while (ret); 1457 if (root->hierarchy_id) {
1451 return true; 1458 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1459 root->hierarchy_id = 0;
1460 }
1452} 1461}
1453 1462
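cgroup_init_root_id() and cgroup_exit_root_id() above replace the old ida retry loop with the idr API. A stand-alone sketch of the same allocation pattern, with invented names (example_idr, example_alloc_id, example_free_id):

#include <linux/idr.h>
#include <linux/gfp.h>

static DEFINE_IDR(example_idr);

/*
 * Cyclically allocate the next free ID >= 2; end == 0 means no upper bound.
 * Returns the new ID on success or a negative errno on failure.
 */
static int example_alloc_id(void *ptr)
{
	return idr_alloc_cyclic(&example_idr, ptr, 2, 0, GFP_KERNEL);
}

static void example_free_id(int id)
{
	idr_remove(&example_idr, id);
}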
1454static int cgroup_test_super(struct super_block *sb, void *data) 1463static int cgroup_test_super(struct super_block *sb, void *data)
@@ -1482,12 +1491,16 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1482 if (!root) 1491 if (!root)
1483 return ERR_PTR(-ENOMEM); 1492 return ERR_PTR(-ENOMEM);
1484 1493
1485 if (!init_root_id(root)) {
1486 kfree(root);
1487 return ERR_PTR(-ENOMEM);
1488 }
1489 init_cgroup_root(root); 1494 init_cgroup_root(root);
1490 1495
1496 /*
1497 * We need to set @root->subsys_mask now so that @root can be
1498 * matched by cgroup_test_super() before it finishes
1499 * initialization; otherwise, competing mounts with the same
1500 * options may try to bind the same subsystems instead of waiting
1501 * for the first one leading to unexpected mount errors.
1502 * SUBSYS_BOUND will be set once actual binding is complete.
1503 */
1491 root->subsys_mask = opts->subsys_mask; 1504 root->subsys_mask = opts->subsys_mask;
1492 root->flags = opts->flags; 1505 root->flags = opts->flags;
1493 ida_init(&root->cgroup_ida); 1506 ida_init(&root->cgroup_ida);
@@ -1500,17 +1513,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1500 return root; 1513 return root;
1501} 1514}
1502 1515
1503static void cgroup_drop_root(struct cgroupfs_root *root) 1516static void cgroup_free_root(struct cgroupfs_root *root)
1504{ 1517{
1505 if (!root) 1518 if (root) {
1506 return; 1519 /* hierarchy ID should already have been released */
1520 WARN_ON_ONCE(root->hierarchy_id);
1507 1521
1508 BUG_ON(!root->hierarchy_id); 1522 ida_destroy(&root->cgroup_ida);
1509 spin_lock(&hierarchy_id_lock); 1523 kfree(root);
1510 ida_remove(&hierarchy_ida, root->hierarchy_id); 1524 }
1511 spin_unlock(&hierarchy_id_lock);
1512 ida_destroy(&root->cgroup_ida);
1513 kfree(root);
1514} 1525}
1515 1526
1516static int cgroup_set_super(struct super_block *sb, void *data) 1527static int cgroup_set_super(struct super_block *sb, void *data)
@@ -1597,7 +1608,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1597 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); 1608 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
1598 if (IS_ERR(sb)) { 1609 if (IS_ERR(sb)) {
1599 ret = PTR_ERR(sb); 1610 ret = PTR_ERR(sb);
1600 cgroup_drop_root(opts.new_root); 1611 cgroup_free_root(opts.new_root);
1601 goto drop_modules; 1612 goto drop_modules;
1602 } 1613 }
1603 1614
@@ -1605,12 +1616,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1605 BUG_ON(!root); 1616 BUG_ON(!root);
1606 if (root == opts.new_root) { 1617 if (root == opts.new_root) {
1607 /* We used the new root structure, so this is a new hierarchy */ 1618 /* We used the new root structure, so this is a new hierarchy */
1608 struct list_head tmp_cg_links; 1619 struct list_head tmp_links;
1609 struct cgroup *root_cgrp = &root->top_cgroup; 1620 struct cgroup *root_cgrp = &root->top_cgroup;
1610 struct cgroupfs_root *existing_root; 1621 struct cgroupfs_root *existing_root;
1611 const struct cred *cred; 1622 const struct cred *cred;
1612 int i; 1623 int i;
1613 struct css_set *cg; 1624 struct css_set *cset;
1614 1625
1615 BUG_ON(sb->s_root != NULL); 1626 BUG_ON(sb->s_root != NULL);
1616 1627
@@ -1637,13 +1648,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1637 * that's us. The worst that can happen is that we 1648 * that's us. The worst that can happen is that we
1638 * have some link structures left over 1649 * have some link structures left over
1639 */ 1650 */
1640 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1651 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1652 if (ret)
1653 goto unlock_drop;
1654
1655 /* ID 0 is reserved for dummy root, 1 for unified hierarchy */
1656 ret = cgroup_init_root_id(root, 2, 0);
1641 if (ret) 1657 if (ret)
1642 goto unlock_drop; 1658 goto unlock_drop;
1643 1659
1644 ret = rebind_subsystems(root, root->subsys_mask); 1660 ret = rebind_subsystems(root, root->subsys_mask, 0);
1645 if (ret == -EBUSY) { 1661 if (ret == -EBUSY) {
1646 free_cg_links(&tmp_cg_links); 1662 free_cgrp_cset_links(&tmp_links);
1647 goto unlock_drop; 1663 goto unlock_drop;
1648 } 1664 }
1649 /* 1665 /*
@@ -1655,8 +1671,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1655 /* EBUSY should be the only error here */ 1671 /* EBUSY should be the only error here */
1656 BUG_ON(ret); 1672 BUG_ON(ret);
1657 1673
1658 list_add(&root->root_list, &roots); 1674 list_add(&root->root_list, &cgroup_roots);
1659 root_count++; 1675 cgroup_root_count++;
1660 1676
1661 sb->s_root->d_fsdata = root_cgrp; 1677 sb->s_root->d_fsdata = root_cgrp;
1662 root->top_cgroup.dentry = sb->s_root; 1678 root->top_cgroup.dentry = sb->s_root;
@@ -1664,11 +1680,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1664 /* Link the top cgroup in this hierarchy into all 1680 /* Link the top cgroup in this hierarchy into all
1665 * the css_set objects */ 1681 * the css_set objects */
1666 write_lock(&css_set_lock); 1682 write_lock(&css_set_lock);
1667 hash_for_each(css_set_table, i, cg, hlist) 1683 hash_for_each(css_set_table, i, cset, hlist)
1668 link_css_set(&tmp_cg_links, cg, root_cgrp); 1684 link_css_set(&tmp_links, cset, root_cgrp);
1669 write_unlock(&css_set_lock); 1685 write_unlock(&css_set_lock);
1670 1686
1671 free_cg_links(&tmp_cg_links); 1687 free_cgrp_cset_links(&tmp_links);
1672 1688
1673 BUG_ON(!list_empty(&root_cgrp->children)); 1689 BUG_ON(!list_empty(&root_cgrp->children));
1674 BUG_ON(root->number_of_cgroups != 1); 1690 BUG_ON(root->number_of_cgroups != 1);
@@ -1684,9 +1700,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1684 * We re-used an existing hierarchy - the new root (if 1700 * We re-used an existing hierarchy - the new root (if
1685 * any) is not needed 1701 * any) is not needed
1686 */ 1702 */
1687 cgroup_drop_root(opts.new_root); 1703 cgroup_free_root(opts.new_root);
1688 1704
1689 if (root->flags != opts.flags) { 1705 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1690 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1706 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1691 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1707 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1692 ret = -EINVAL; 1708 ret = -EINVAL;
@@ -1705,6 +1721,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1705 return dget(sb->s_root); 1721 return dget(sb->s_root);
1706 1722
1707 unlock_drop: 1723 unlock_drop:
1724 cgroup_exit_root_id(root);
1708 mutex_unlock(&cgroup_root_mutex); 1725 mutex_unlock(&cgroup_root_mutex);
1709 mutex_unlock(&cgroup_mutex); 1726 mutex_unlock(&cgroup_mutex);
1710 mutex_unlock(&inode->i_mutex); 1727 mutex_unlock(&inode->i_mutex);
@@ -1721,9 +1738,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1721static void cgroup_kill_sb(struct super_block *sb) { 1738static void cgroup_kill_sb(struct super_block *sb) {
1722 struct cgroupfs_root *root = sb->s_fs_info; 1739 struct cgroupfs_root *root = sb->s_fs_info;
1723 struct cgroup *cgrp = &root->top_cgroup; 1740 struct cgroup *cgrp = &root->top_cgroup;
1741 struct cgrp_cset_link *link, *tmp_link;
1724 int ret; 1742 int ret;
1725 struct cg_cgroup_link *link;
1726 struct cg_cgroup_link *saved_link;
1727 1743
1728 BUG_ON(!root); 1744 BUG_ON(!root);
1729 1745
@@ -1734,36 +1750,39 @@ static void cgroup_kill_sb(struct super_block *sb) {
1734 mutex_lock(&cgroup_root_mutex); 1750 mutex_lock(&cgroup_root_mutex);
1735 1751
1736 /* Rebind all subsystems back to the default hierarchy */ 1752 /* Rebind all subsystems back to the default hierarchy */
1737 ret = rebind_subsystems(root, 0); 1753 if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
1738 /* Shouldn't be able to fail ... */ 1754 ret = rebind_subsystems(root, 0, root->subsys_mask);
1739 BUG_ON(ret); 1755 /* Shouldn't be able to fail ... */
1756 BUG_ON(ret);
1757 }
1740 1758
1741 /* 1759 /*
1742 * Release all the links from css_sets to this hierarchy's 1760 * Release all the links from cset_links to this hierarchy's
1743 * root cgroup 1761 * root cgroup
1744 */ 1762 */
1745 write_lock(&css_set_lock); 1763 write_lock(&css_set_lock);
1746 1764
1747 list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, 1765 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1748 cgrp_link_list) { 1766 list_del(&link->cset_link);
1749 list_del(&link->cg_link_list); 1767 list_del(&link->cgrp_link);
1750 list_del(&link->cgrp_link_list);
1751 kfree(link); 1768 kfree(link);
1752 } 1769 }
1753 write_unlock(&css_set_lock); 1770 write_unlock(&css_set_lock);
1754 1771
1755 if (!list_empty(&root->root_list)) { 1772 if (!list_empty(&root->root_list)) {
1756 list_del(&root->root_list); 1773 list_del(&root->root_list);
1757 root_count--; 1774 cgroup_root_count--;
1758 } 1775 }
1759 1776
1777 cgroup_exit_root_id(root);
1778
1760 mutex_unlock(&cgroup_root_mutex); 1779 mutex_unlock(&cgroup_root_mutex);
1761 mutex_unlock(&cgroup_mutex); 1780 mutex_unlock(&cgroup_mutex);
1762 1781
1763 simple_xattrs_free(&cgrp->xattrs); 1782 simple_xattrs_free(&cgrp->xattrs);
1764 1783
1765 kill_litter_super(sb); 1784 kill_litter_super(sb);
1766 cgroup_drop_root(root); 1785 cgroup_free_root(root);
1767} 1786}
1768 1787
1769static struct file_system_type cgroup_fs_type = { 1788static struct file_system_type cgroup_fs_type = {
@@ -1825,6 +1844,45 @@ out:
1825} 1844}
1826EXPORT_SYMBOL_GPL(cgroup_path); 1845EXPORT_SYMBOL_GPL(cgroup_path);
1827 1846
1847/**
1848 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1849 * @task: target task
1850 * @buf: the buffer to write the path into
1851 * @buflen: the length of the buffer
1852 *
1853 * Determine @task's cgroup on the first (the one with the lowest non-zero
1854 * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
1855 * function grabs cgroup_mutex and shouldn't be used inside locks used by
1856 * cgroup controller callbacks.
1857 *
1858 * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short.
1859 */
1860int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1861{
1862 struct cgroupfs_root *root;
1863 struct cgroup *cgrp;
1864 int hierarchy_id = 1, ret = 0;
1865
1866 if (buflen < 2)
1867 return -ENAMETOOLONG;
1868
1869 mutex_lock(&cgroup_mutex);
1870
1871 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1872
1873 if (root) {
1874 cgrp = task_cgroup_from_root(task, root);
1875 ret = cgroup_path(cgrp, buf, buflen);
1876 } else {
1877 /* if no hierarchy exists, everyone is in "/" */
1878 memcpy(buf, "/", 2);
1879 }
1880
1881 mutex_unlock(&cgroup_mutex);
1882 return ret;
1883}
1884EXPORT_SYMBOL_GPL(task_cgroup_path);
1885
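A hypothetical caller of the new task_cgroup_path() helper, shown only to illustrate its contract; the reporting function and buffer size are made up:

#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/printk.h>

static void example_report_cgroup(struct task_struct *task)
{
	char buf[256];
	int ret;

	/* grabs cgroup_mutex internally, so don't call this under it */
	ret = task_cgroup_path(task, buf, sizeof(buf));
	if (!ret)
		pr_info("pid %d is in cgroup %s\n", task_pid_nr(task), buf);
	else	/* -ENAMETOOLONG when the path does not fit in @buf */
		pr_warn("cgroup path lookup failed: %d\n", ret);
}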
1828/* 1886/*
1829 * Control Group taskset 1887 * Control Group taskset
1830 */ 1888 */
@@ -1910,10 +1968,11 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1910 * 1968 *
1911 * Must be called with cgroup_mutex and threadgroup locked. 1969 * Must be called with cgroup_mutex and threadgroup locked.
1912 */ 1970 */
1913static void cgroup_task_migrate(struct cgroup *oldcgrp, 1971static void cgroup_task_migrate(struct cgroup *old_cgrp,
1914 struct task_struct *tsk, struct css_set *newcg) 1972 struct task_struct *tsk,
1973 struct css_set *new_cset)
1915{ 1974{
1916 struct css_set *oldcg; 1975 struct css_set *old_cset;
1917 1976
1918 /* 1977 /*
1919 * We are synchronized through threadgroup_lock() against PF_EXITING 1978 * We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1921,25 +1980,25 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp,
1921 * css_set to init_css_set and dropping the old one. 1980 * css_set to init_css_set and dropping the old one.
1922 */ 1981 */
1923 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1982 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1924 oldcg = tsk->cgroups; 1983 old_cset = task_css_set(tsk);
1925 1984
1926 task_lock(tsk); 1985 task_lock(tsk);
1927 rcu_assign_pointer(tsk->cgroups, newcg); 1986 rcu_assign_pointer(tsk->cgroups, new_cset);
1928 task_unlock(tsk); 1987 task_unlock(tsk);
1929 1988
1930 /* Update the css_set linked lists if we're using them */ 1989 /* Update the css_set linked lists if we're using them */
1931 write_lock(&css_set_lock); 1990 write_lock(&css_set_lock);
1932 if (!list_empty(&tsk->cg_list)) 1991 if (!list_empty(&tsk->cg_list))
1933 list_move(&tsk->cg_list, &newcg->tasks); 1992 list_move(&tsk->cg_list, &new_cset->tasks);
1934 write_unlock(&css_set_lock); 1993 write_unlock(&css_set_lock);
1935 1994
1936 /* 1995 /*
1937 * We just gained a reference on oldcg by taking it from the task. As 1996 * We just gained a reference on old_cset by taking it from the
1938 * trading it for newcg is protected by cgroup_mutex, we're safe to drop 1997 * task. As trading it for new_cset is protected by cgroup_mutex,
1939 * it here; it will be freed under RCU. 1998 * we're safe to drop it here; it will be freed under RCU.
1940 */ 1999 */
1941 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 2000 set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1942 put_css_set(oldcg); 2001 put_css_set(old_cset);
1943} 2002}
1944 2003
1945/** 2004/**
@@ -2029,7 +2088,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2029 /* 2088 /*
2030 * step 1: check that we can legitimately attach to the cgroup. 2089 * step 1: check that we can legitimately attach to the cgroup.
2031 */ 2090 */
2032 for_each_subsys(root, ss) { 2091 for_each_root_subsys(root, ss) {
2033 if (ss->can_attach) { 2092 if (ss->can_attach) {
2034 retval = ss->can_attach(cgrp, &tset); 2093 retval = ss->can_attach(cgrp, &tset);
2035 if (retval) { 2094 if (retval) {
@@ -2044,8 +2103,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2044 * we use find_css_set, which allocates a new one if necessary. 2103 * we use find_css_set, which allocates a new one if necessary.
2045 */ 2104 */
2046 for (i = 0; i < group_size; i++) { 2105 for (i = 0; i < group_size; i++) {
2106 struct css_set *old_cset;
2107
2047 tc = flex_array_get(group, i); 2108 tc = flex_array_get(group, i);
2048 tc->cg = find_css_set(tc->task->cgroups, cgrp); 2109 old_cset = task_css_set(tc->task);
2110 tc->cg = find_css_set(old_cset, cgrp);
2049 if (!tc->cg) { 2111 if (!tc->cg) {
2050 retval = -ENOMEM; 2112 retval = -ENOMEM;
2051 goto out_put_css_set_refs; 2113 goto out_put_css_set_refs;
@@ -2066,7 +2128,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2066 /* 2128 /*
2067 * step 4: do subsystem attach callbacks. 2129 * step 4: do subsystem attach callbacks.
2068 */ 2130 */
2069 for_each_subsys(root, ss) { 2131 for_each_root_subsys(root, ss) {
2070 if (ss->attach) 2132 if (ss->attach)
2071 ss->attach(cgrp, &tset); 2133 ss->attach(cgrp, &tset);
2072 } 2134 }
@@ -2086,7 +2148,7 @@ out_put_css_set_refs:
2086 } 2148 }
2087out_cancel_attach: 2149out_cancel_attach:
2088 if (retval) { 2150 if (retval) {
2089 for_each_subsys(root, ss) { 2151 for_each_root_subsys(root, ss) {
2090 if (ss == failed_ss) 2152 if (ss == failed_ss)
2091 break; 2153 break;
2092 if (ss->cancel_attach) 2154 if (ss->cancel_attach)
@@ -2323,7 +2385,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2323 struct cftype *cft = __d_cft(file->f_dentry); 2385 struct cftype *cft = __d_cft(file->f_dentry);
2324 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2386 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2325 2387
2326 if (cgroup_is_removed(cgrp)) 2388 if (cgroup_is_dead(cgrp))
2327 return -ENODEV; 2389 return -ENODEV;
2328 if (cft->write) 2390 if (cft->write)
2329 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 2391 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -2368,7 +2430,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2368 struct cftype *cft = __d_cft(file->f_dentry); 2430 struct cftype *cft = __d_cft(file->f_dentry);
2369 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2431 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2370 2432
2371 if (cgroup_is_removed(cgrp)) 2433 if (cgroup_is_dead(cgrp))
2372 return -ENODEV; 2434 return -ENODEV;
2373 2435
2374 if (cft->read) 2436 if (cft->read)
@@ -2435,10 +2497,12 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2435 cft = __d_cft(file->f_dentry); 2497 cft = __d_cft(file->f_dentry);
2436 2498
2437 if (cft->read_map || cft->read_seq_string) { 2499 if (cft->read_map || cft->read_seq_string) {
2438 struct cgroup_seqfile_state *state = 2500 struct cgroup_seqfile_state *state;
2439 kzalloc(sizeof(*state), GFP_USER); 2501
2502 state = kzalloc(sizeof(*state), GFP_USER);
2440 if (!state) 2503 if (!state)
2441 return -ENOMEM; 2504 return -ENOMEM;
2505
2442 state->cft = cft; 2506 state->cft = cft;
2443 state->cgroup = __d_cgrp(file->f_dentry->d_parent); 2507 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2444 file->f_op = &cgroup_seqfile_operations; 2508 file->f_op = &cgroup_seqfile_operations;
@@ -2486,6 +2550,13 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2486 2550
2487 cgrp = __d_cgrp(old_dentry); 2551 cgrp = __d_cgrp(old_dentry);
2488 2552
2553 /*
2554 * This isn't a proper migration and its usefulness is very
2555 * limited. Disallow if sane_behavior.
2556 */
2557 if (cgroup_sane_behavior(cgrp))
2558 return -EPERM;
2559
2489 name = cgroup_alloc_name(new_dentry); 2560 name = cgroup_alloc_name(new_dentry);
2490 if (!name) 2561 if (!name)
2491 return -ENOMEM; 2562 return -ENOMEM;
@@ -2496,7 +2567,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2496 return ret; 2567 return ret;
2497 } 2568 }
2498 2569
2499 old_name = cgrp->name; 2570 old_name = rcu_dereference_protected(cgrp->name, true);
2500 rcu_assign_pointer(cgrp->name, name); 2571 rcu_assign_pointer(cgrp->name, name);
2501 2572
2502 kfree_rcu(old_name, rcu_head); 2573 kfree_rcu(old_name, rcu_head);
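The rename path above publishes the new name with rcu_assign_pointer() and defers freeing of the old one with kfree_rcu(). A self-contained sketch of that publish-then-free pattern; struct example_name and example_set_name() are invented for illustration:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_name {
	struct rcu_head rcu_head;
	char name[64];
};

/*
 * Caller holds the lock protecting @slot, hence the "true" condition below;
 * assumes @slot currently points at a valid old name.
 */
static void example_set_name(struct example_name __rcu **slot,
			     struct example_name *new_name)
{
	struct example_name *old;

	old = rcu_dereference_protected(*slot, true);
	rcu_assign_pointer(*slot, new_name);
	kfree_rcu(old, rcu_head);	/* freed only after a grace period */
}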
@@ -2577,7 +2648,7 @@ static const struct inode_operations cgroup_file_inode_operations = {
2577}; 2648};
2578 2649
2579static const struct inode_operations cgroup_dir_inode_operations = { 2650static const struct inode_operations cgroup_dir_inode_operations = {
2580 .lookup = cgroup_lookup, 2651 .lookup = simple_lookup,
2581 .mkdir = cgroup_mkdir, 2652 .mkdir = cgroup_mkdir,
2582 .rmdir = cgroup_rmdir, 2653 .rmdir = cgroup_rmdir,
2583 .rename = cgroup_rename, 2654 .rename = cgroup_rename,
@@ -2587,14 +2658,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2587 .removexattr = cgroup_removexattr, 2658 .removexattr = cgroup_removexattr,
2588}; 2659};
2589 2660
2590static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
2591{
2592 if (dentry->d_name.len > NAME_MAX)
2593 return ERR_PTR(-ENAMETOOLONG);
2594 d_add(dentry, NULL);
2595 return NULL;
2596}
2597
2598/* 2661/*
2599 * Check if a file is a control file 2662 * Check if a file is a control file
2600 */ 2663 */
@@ -2747,58 +2810,78 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2747 return ret; 2810 return ret;
2748} 2811}
2749 2812
2750static DEFINE_MUTEX(cgroup_cft_mutex);
2751
2752static void cgroup_cfts_prepare(void) 2813static void cgroup_cfts_prepare(void)
2753 __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) 2814 __acquires(&cgroup_mutex)
2754{ 2815{
2755 /* 2816 /*
2756 * Thanks to the entanglement with vfs inode locking, we can't walk 2817 * Thanks to the entanglement with vfs inode locking, we can't walk
2757 * the existing cgroups under cgroup_mutex and create files. 2818 * the existing cgroups under cgroup_mutex and create files.
2758 * Instead, we increment reference on all cgroups and build list of 2819 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU
2759 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure 2820 * read lock before calling cgroup_addrm_files().
2760 * exclusive access to the field.
2761 */ 2821 */
2762 mutex_lock(&cgroup_cft_mutex);
2763 mutex_lock(&cgroup_mutex); 2822 mutex_lock(&cgroup_mutex);
2764} 2823}
2765 2824
2766static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2825static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2767 struct cftype *cfts, bool is_add) 2826 struct cftype *cfts, bool is_add)
2768 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) 2827 __releases(&cgroup_mutex)
2769{ 2828{
2770 LIST_HEAD(pending); 2829 LIST_HEAD(pending);
2771 struct cgroup *cgrp, *n; 2830 struct cgroup *cgrp, *root = &ss->root->top_cgroup;
2831 struct super_block *sb = ss->root->sb;
2832 struct dentry *prev = NULL;
2833 struct inode *inode;
2834 u64 update_before;
2772 2835
2773 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2836 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2774 if (cfts && ss->root != &rootnode) { 2837 if (!cfts || ss->root == &cgroup_dummy_root ||
2775 list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { 2838 !atomic_inc_not_zero(&sb->s_active)) {
2776 dget(cgrp->dentry); 2839 mutex_unlock(&cgroup_mutex);
2777 list_add_tail(&cgrp->cft_q_node, &pending); 2840 return;
2778 }
2779 } 2841 }
2780 2842
2781 mutex_unlock(&cgroup_mutex);
2782
2783 /* 2843 /*
2784 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm 2844 * All cgroups which are created after we drop cgroup_mutex will
2785 * files for all cgroups which were created before. 2845 * have the updated set of files, so we only need to update the
2846 * cgroups created before the current @cgroup_serial_nr_next.
2786 */ 2847 */
2787 list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { 2848 update_before = cgroup_serial_nr_next;
2788 struct inode *inode = cgrp->dentry->d_inode; 2849
2850 mutex_unlock(&cgroup_mutex);
2851
2852 /* @root always needs to be updated */
2853 inode = root->dentry->d_inode;
2854 mutex_lock(&inode->i_mutex);
2855 mutex_lock(&cgroup_mutex);
2856 cgroup_addrm_files(root, ss, cfts, is_add);
2857 mutex_unlock(&cgroup_mutex);
2858 mutex_unlock(&inode->i_mutex);
2859
2860 /* add/rm files for all cgroups created before */
2861 rcu_read_lock();
2862 cgroup_for_each_descendant_pre(cgrp, root) {
2863 if (cgroup_is_dead(cgrp))
2864 continue;
2865
2866 inode = cgrp->dentry->d_inode;
2867 dget(cgrp->dentry);
2868 rcu_read_unlock();
2869
2870 dput(prev);
2871 prev = cgrp->dentry;
2789 2872
2790 mutex_lock(&inode->i_mutex); 2873 mutex_lock(&inode->i_mutex);
2791 mutex_lock(&cgroup_mutex); 2874 mutex_lock(&cgroup_mutex);
2792 if (!cgroup_is_removed(cgrp)) 2875 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2793 cgroup_addrm_files(cgrp, ss, cfts, is_add); 2876 cgroup_addrm_files(cgrp, ss, cfts, is_add);
2794 mutex_unlock(&cgroup_mutex); 2877 mutex_unlock(&cgroup_mutex);
2795 mutex_unlock(&inode->i_mutex); 2878 mutex_unlock(&inode->i_mutex);
2796 2879
2797 list_del_init(&cgrp->cft_q_node); 2880 rcu_read_lock();
2798 dput(cgrp->dentry);
2799 } 2881 }
2800 2882 rcu_read_unlock();
2801 mutex_unlock(&cgroup_cft_mutex); 2883 dput(prev);
2884 deactivate_super(sb);
2802} 2885}
2803 2886
2804/** 2887/**
@@ -2853,7 +2936,8 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2853 2936
2854 list_for_each_entry(set, &ss->cftsets, node) { 2937 list_for_each_entry(set, &ss->cftsets, node) {
2855 if (set->cfts == cfts) { 2938 if (set->cfts == cfts) {
2856 list_del_init(&set->node); 2939 list_del(&set->node);
2940 kfree(set);
2857 cgroup_cfts_commit(ss, cfts, false); 2941 cgroup_cfts_commit(ss, cfts, false);
2858 return 0; 2942 return 0;
2859 } 2943 }
@@ -2872,12 +2956,11 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2872int cgroup_task_count(const struct cgroup *cgrp) 2956int cgroup_task_count(const struct cgroup *cgrp)
2873{ 2957{
2874 int count = 0; 2958 int count = 0;
2875 struct cg_cgroup_link *link; 2959 struct cgrp_cset_link *link;
2876 2960
2877 read_lock(&css_set_lock); 2961 read_lock(&css_set_lock);
2878 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { 2962 list_for_each_entry(link, &cgrp->cset_links, cset_link)
2879 count += atomic_read(&link->cg->refcount); 2963 count += atomic_read(&link->cset->refcount);
2880 }
2881 read_unlock(&css_set_lock); 2964 read_unlock(&css_set_lock);
2882 return count; 2965 return count;
2883} 2966}
@@ -2886,25 +2969,24 @@ int cgroup_task_count(const struct cgroup *cgrp)
2886 * Advance a list_head iterator. The iterator should be positioned at 2969 * Advance a list_head iterator. The iterator should be positioned at
2887 * the start of a css_set 2970 * the start of a css_set
2888 */ 2971 */
2889static void cgroup_advance_iter(struct cgroup *cgrp, 2972static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it)
2890 struct cgroup_iter *it)
2891{ 2973{
2892 struct list_head *l = it->cg_link; 2974 struct list_head *l = it->cset_link;
2893 struct cg_cgroup_link *link; 2975 struct cgrp_cset_link *link;
2894 struct css_set *cg; 2976 struct css_set *cset;
2895 2977
2896 /* Advance to the next non-empty css_set */ 2978 /* Advance to the next non-empty css_set */
2897 do { 2979 do {
2898 l = l->next; 2980 l = l->next;
2899 if (l == &cgrp->css_sets) { 2981 if (l == &cgrp->cset_links) {
2900 it->cg_link = NULL; 2982 it->cset_link = NULL;
2901 return; 2983 return;
2902 } 2984 }
2903 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); 2985 link = list_entry(l, struct cgrp_cset_link, cset_link);
2904 cg = link->cg; 2986 cset = link->cset;
2905 } while (list_empty(&cg->tasks)); 2987 } while (list_empty(&cset->tasks));
2906 it->cg_link = l; 2988 it->cset_link = l;
2907 it->task = cg->tasks.next; 2989 it->task = cset->tasks.next;
2908} 2990}
2909 2991
2910/* 2992/*
@@ -2934,7 +3016,7 @@ static void cgroup_enable_task_cg_lists(void)
2934 * entry won't be deleted though the process has exited. 3016 * entry won't be deleted though the process has exited.
2935 */ 3017 */
2936 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) 3018 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2937 list_add(&p->cg_list, &p->cgroups->tasks); 3019 list_add(&p->cg_list, &task_css_set(p)->tasks);
2938 task_unlock(p); 3020 task_unlock(p);
2939 } while_each_thread(g, p); 3021 } while_each_thread(g, p);
2940 read_unlock(&tasklist_lock); 3022 read_unlock(&tasklist_lock);
@@ -2942,12 +3024,67 @@ static void cgroup_enable_task_cg_lists(void)
2942} 3024}
2943 3025
2944/** 3026/**
3027 * cgroup_next_sibling - find the next sibling of a given cgroup
3028 * @pos: the current cgroup
3029 *
3030 * This function returns the next sibling of @pos and should be called
3031 * under RCU read lock. The only requirement is that @pos is accessible.
3032 * The next sibling is guaranteed to be returned regardless of @pos's
3033 * state.
3034 */
3035struct cgroup *cgroup_next_sibling(struct cgroup *pos)
3036{
3037 struct cgroup *next;
3038
3039 WARN_ON_ONCE(!rcu_read_lock_held());
3040
3041 /*
3042 * @pos could already have been removed. Once a cgroup is removed,
3043 * its ->sibling.next is no longer updated when its next sibling
3044 * changes. As CGRP_DEAD assertion is serialized and happens
3045 * before the cgroup is taken off the ->sibling list, if we see it
3046 * unasserted, it's guaranteed that the next sibling hasn't
3047 * finished its grace period even if it's already removed, and thus
3048 * safe to dereference from this RCU critical section. If
3049 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3050 * to be visible as %true here.
3051 */
3052 if (likely(!cgroup_is_dead(pos))) {
3053 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3054 if (&next->sibling != &pos->parent->children)
3055 return next;
3056 return NULL;
3057 }
3058
3059 /*
3060 * Can't dereference the next pointer. Each cgroup is given a
3061 * monotonically increasing unique serial number and always
3062 * appended to the sibling list, so the next one can be found by
3063 * walking the parent's children until we see a cgroup with higher
3064 * serial number than @pos's.
3065 *
3066 * While this path can be slow, it's taken only when either the
3067 * current cgroup is removed or iteration and removal race.
3068 */
3069 list_for_each_entry_rcu(next, &pos->parent->children, sibling)
3070 if (next->serial_nr > pos->serial_nr)
3071 return next;
3072 return NULL;
3073}
3074EXPORT_SYMBOL_GPL(cgroup_next_sibling);
3075
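A toy walker, not part of the patch, showing how cgroup_next_sibling() lets an RCU-protected iteration continue even if the current position is removed mid-walk:

#include <linux/cgroup.h>
#include <linux/rculist.h>

static void example_walk_children(struct cgroup *parent)
{
	struct cgroup *pos;

	rcu_read_lock();
	pos = list_entry_rcu(parent->children.next, struct cgroup, sibling);
	if (&pos->sibling == &parent->children)
		pos = NULL;		/* no children */
	while (pos) {
		/* ... inspect @pos; it may already be marked CGRP_DEAD ... */
		pos = cgroup_next_sibling(pos);
	}
	rcu_read_unlock();
}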
3076/**
2945 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 3077 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
2946 * @pos: the current position (%NULL to initiate traversal) 3078 * @pos: the current position (%NULL to initiate traversal)
2947 * @cgroup: cgroup whose descendants to walk 3079 * @cgroup: cgroup whose descendants to walk
2948 * 3080 *
2949 * To be used by cgroup_for_each_descendant_pre(). Find the next 3081 * To be used by cgroup_for_each_descendant_pre(). Find the next
2950 * descendant to visit for pre-order traversal of @cgroup's descendants. 3082 * descendant to visit for pre-order traversal of @cgroup's descendants.
3083 *
3084 * While this function requires RCU read locking, it doesn't require the
3085 * whole traversal to be contained in a single RCU critical section. This
3086 * function will return the correct next descendant as long as both @pos
3087 * and @cgroup are accessible and @pos is a descendant of @cgroup.
2951 */ 3088 */
2952struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 3089struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2953 struct cgroup *cgroup) 3090 struct cgroup *cgroup)
@@ -2967,11 +3104,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2967 3104
2968 /* no child, visit my or the closest ancestor's next sibling */ 3105 /* no child, visit my or the closest ancestor's next sibling */
2969 while (pos != cgroup) { 3106 while (pos != cgroup) {
2970 next = list_entry_rcu(pos->sibling.next, struct cgroup, 3107 next = cgroup_next_sibling(pos);
2971 sibling); 3108 if (next)
2972 if (&next->sibling != &pos->parent->children)
2973 return next; 3109 return next;
2974
2975 pos = pos->parent; 3110 pos = pos->parent;
2976 } 3111 }
2977 3112
@@ -2986,6 +3121,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
2986 * Return the rightmost descendant of @pos. If there's no descendant, 3121 * Return the rightmost descendant of @pos. If there's no descendant,
2987 * @pos is returned. This can be used during pre-order traversal to skip 3122 * @pos is returned. This can be used during pre-order traversal to skip
2988 * subtree of @pos. 3123 * subtree of @pos.
3124 *
3125 * While this function requires RCU read locking, it doesn't require the
3126 * whole traversal to be contained in a single RCU critical section. This
3127 * function will return the correct rightmost descendant as long as @pos is
3128 * accessible.
2989 */ 3129 */
2990struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3130struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
2991{ 3131{
@@ -3025,6 +3165,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3025 * 3165 *
3026 * To be used by cgroup_for_each_descendant_post(). Find the next 3166 * To be used by cgroup_for_each_descendant_post(). Find the next
3027 * descendant to visit for post-order traversal of @cgroup's descendants. 3167 * descendant to visit for post-order traversal of @cgroup's descendants.
3168 *
3169 * While this function requires RCU read locking, it doesn't require the
3170 * whole traversal to be contained in a single RCU critical section. This
3171 * function will return the correct next descendant as long as both @pos
3172 * and @cgroup are accessible and @pos is a descendant of @cgroup.
3028 */ 3173 */
3029struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3174struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3030 struct cgroup *cgroup) 3175 struct cgroup *cgroup)
@@ -3040,8 +3185,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3040 } 3185 }
3041 3186
3042 /* if there's an unvisited sibling, visit its leftmost descendant */ 3187 /* if there's an unvisited sibling, visit its leftmost descendant */
3043 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3188 next = cgroup_next_sibling(pos);
3044 if (&next->sibling != &pos->parent->children) 3189 if (next)
3045 return cgroup_leftmost_descendant(next); 3190 return cgroup_leftmost_descendant(next);
3046 3191
3047 /* no sibling left, visit parent */ 3192 /* no sibling left, visit parent */
@@ -3062,7 +3207,7 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
3062 cgroup_enable_task_cg_lists(); 3207 cgroup_enable_task_cg_lists();
3063 3208
3064 read_lock(&css_set_lock); 3209 read_lock(&css_set_lock);
3065 it->cg_link = &cgrp->css_sets; 3210 it->cset_link = &cgrp->cset_links;
3066 cgroup_advance_iter(cgrp, it); 3211 cgroup_advance_iter(cgrp, it);
3067} 3212}
3068 3213
@@ -3071,16 +3216,16 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3071{ 3216{
3072 struct task_struct *res; 3217 struct task_struct *res;
3073 struct list_head *l = it->task; 3218 struct list_head *l = it->task;
3074 struct cg_cgroup_link *link; 3219 struct cgrp_cset_link *link;
3075 3220
3076 /* If the iterator cg is NULL, we have no tasks */ 3221 /* If the iterator cg is NULL, we have no tasks */
3077 if (!it->cg_link) 3222 if (!it->cset_link)
3078 return NULL; 3223 return NULL;
3079 res = list_entry(l, struct task_struct, cg_list); 3224 res = list_entry(l, struct task_struct, cg_list);
3080 /* Advance iterator to find next entry */ 3225 /* Advance iterator to find next entry */
3081 l = l->next; 3226 l = l->next;
3082 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); 3227 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3083 if (l == &link->cg->tasks) { 3228 if (l == &link->cset->tasks) {
3084 /* We reached the end of this task list - move on to 3229 /* We reached the end of this task list - move on to
3085 * the next cg_cgroup_link */ 3230 * the next cg_cgroup_link */
3086 cgroup_advance_iter(cgrp, it); 3231 cgroup_advance_iter(cgrp, it);
@@ -3411,7 +3556,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3411 } 3556 }
3412 } 3557 }
3413 /* entry not found; create a new one */ 3558 /* entry not found; create a new one */
3414 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3559 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3415 if (!l) { 3560 if (!l) {
3416 mutex_unlock(&cgrp->pidlist_mutex); 3561 mutex_unlock(&cgrp->pidlist_mutex);
3417 return l; 3562 return l;
@@ -3420,8 +3565,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3420 down_write(&l->mutex); 3565 down_write(&l->mutex);
3421 l->key.type = type; 3566 l->key.type = type;
3422 l->key.ns = get_pid_ns(ns); 3567 l->key.ns = get_pid_ns(ns);
3423 l->use_count = 0; /* don't increment here */
3424 l->list = NULL;
3425 l->owner = cgrp; 3568 l->owner = cgrp;
3426 list_add(&l->links, &cgrp->pidlists); 3569 list_add(&l->links, &cgrp->pidlists);
3427 mutex_unlock(&cgrp->pidlist_mutex); 3570 mutex_unlock(&cgrp->pidlist_mutex);
@@ -3727,6 +3870,23 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
3727} 3870}
3728 3871
3729/* 3872/*
3873 * When dput() is called asynchronously, if umount has been done and
3874 * then deactivate_super() in cgroup_free_fn() kills the superblock,
3875 * there's a small window that vfs will see the root dentry with non-zero
3876 * refcnt and trigger BUG().
3877 *
3878 * That's why we hold a reference before dput() and drop it right after.
3879 */
3880static void cgroup_dput(struct cgroup *cgrp)
3881{
3882 struct super_block *sb = cgrp->root->sb;
3883
3884 atomic_inc(&sb->s_active);
3885 dput(cgrp->dentry);
3886 deactivate_super(sb);
3887}
3888
3889/*
3730 * Unregister event and free resources. 3890 * Unregister event and free resources.
3731 * 3891 *
3732 * Gets called from workqueue. 3892 * Gets called from workqueue.
@@ -3746,7 +3906,7 @@ static void cgroup_event_remove(struct work_struct *work)
3746 3906
3747 eventfd_ctx_put(event->eventfd); 3907 eventfd_ctx_put(event->eventfd);
3748 kfree(event); 3908 kfree(event);
3749 dput(cgrp->dentry); 3909 cgroup_dput(cgrp);
3750} 3910}
3751 3911
3752/* 3912/*
@@ -3933,33 +4093,16 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
3933 return 0; 4093 return 0;
3934} 4094}
3935 4095
3936/* 4096static struct cftype cgroup_base_files[] = {
3937 * for the common functions, 'private' gives the type of file
3938 */
3939/* for hysterical raisins, we can't put this on the older files */
3940#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
3941static struct cftype files[] = {
3942 { 4097 {
3943 .name = "tasks", 4098 .name = "cgroup.procs",
3944 .open = cgroup_tasks_open,
3945 .write_u64 = cgroup_tasks_write,
3946 .release = cgroup_pidlist_release,
3947 .mode = S_IRUGO | S_IWUSR,
3948 },
3949 {
3950 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
3951 .open = cgroup_procs_open, 4099 .open = cgroup_procs_open,
3952 .write_u64 = cgroup_procs_write, 4100 .write_u64 = cgroup_procs_write,
3953 .release = cgroup_pidlist_release, 4101 .release = cgroup_pidlist_release,
3954 .mode = S_IRUGO | S_IWUSR, 4102 .mode = S_IRUGO | S_IWUSR,
3955 }, 4103 },
3956 { 4104 {
3957 .name = "notify_on_release", 4105 .name = "cgroup.event_control",
3958 .read_u64 = cgroup_read_notify_on_release,
3959 .write_u64 = cgroup_write_notify_on_release,
3960 },
3961 {
3962 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3963 .write_string = cgroup_write_event_control, 4106 .write_string = cgroup_write_event_control,
3964 .mode = S_IWUGO, 4107 .mode = S_IWUGO,
3965 }, 4108 },
@@ -3974,9 +4117,29 @@ static struct cftype files[] = {
3974 .flags = CFTYPE_ONLY_ON_ROOT, 4117 .flags = CFTYPE_ONLY_ON_ROOT,
3975 .read_seq_string = cgroup_sane_behavior_show, 4118 .read_seq_string = cgroup_sane_behavior_show,
3976 }, 4119 },
4120
4121 /*
4122 * Historical crazy stuff. These don't have "cgroup." prefix and
4123 * don't exist if sane_behavior. If you're depending on these, be
4124 * prepared to be burned.
4125 */
4126 {
4127 .name = "tasks",
4128 .flags = CFTYPE_INSANE, /* use "procs" instead */
4129 .open = cgroup_tasks_open,
4130 .write_u64 = cgroup_tasks_write,
4131 .release = cgroup_pidlist_release,
4132 .mode = S_IRUGO | S_IWUSR,
4133 },
4134 {
4135 .name = "notify_on_release",
4136 .flags = CFTYPE_INSANE,
4137 .read_u64 = cgroup_read_notify_on_release,
4138 .write_u64 = cgroup_write_notify_on_release,
4139 },
3977 { 4140 {
3978 .name = "release_agent", 4141 .name = "release_agent",
3979 .flags = CFTYPE_ONLY_ON_ROOT, 4142 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3980 .read_seq_string = cgroup_release_agent_show, 4143 .read_seq_string = cgroup_release_agent_show,
3981 .write_string = cgroup_release_agent_write, 4144 .write_string = cgroup_release_agent_write,
3982 .max_write_len = PATH_MAX, 4145 .max_write_len = PATH_MAX,
@@ -3997,13 +4160,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
3997 struct cgroup_subsys *ss; 4160 struct cgroup_subsys *ss;
3998 4161
3999 if (base_files) { 4162 if (base_files) {
4000 err = cgroup_addrm_files(cgrp, NULL, files, true); 4163 err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
4001 if (err < 0) 4164 if (err < 0)
4002 return err; 4165 return err;
4003 } 4166 }
4004 4167
4005 /* process cftsets of each subsystem */ 4168 /* process cftsets of each subsystem */
4006 for_each_subsys(cgrp->root, ss) { 4169 for_each_root_subsys(cgrp->root, ss) {
4007 struct cftype_set *set; 4170 struct cftype_set *set;
4008 if (!test_bit(ss->subsys_id, &subsys_mask)) 4171 if (!test_bit(ss->subsys_id, &subsys_mask))
4009 continue; 4172 continue;
@@ -4013,15 +4176,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
4013 } 4176 }
4014 4177
4015 /* This cgroup is ready now */ 4178 /* This cgroup is ready now */
4016 for_each_subsys(cgrp->root, ss) { 4179 for_each_root_subsys(cgrp->root, ss) {
4017 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4180 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4181 struct css_id *id = rcu_dereference_protected(css->id, true);
4182
4018 /* 4183 /*
4019 * Update id->css pointer and make this css visible from 4184 * Update id->css pointer and make this css visible from
4020 * CSS ID functions. This pointer will be dereferenced 4185 * CSS ID functions. This pointer will be dereferenced
4021 * from RCU-read-side without locks. 4186 * from RCU-read-side without locks.
4022 */ 4187 */
4023 if (css->id) 4188 if (id)
4024 rcu_assign_pointer(css->id->css, css); 4189 rcu_assign_pointer(id->css, css);
4025 } 4190 }
4026 4191
4027 return 0; 4192 return 0;
@@ -4031,12 +4196,16 @@ static void css_dput_fn(struct work_struct *work)
4031{ 4196{
4032 struct cgroup_subsys_state *css = 4197 struct cgroup_subsys_state *css =
4033 container_of(work, struct cgroup_subsys_state, dput_work); 4198 container_of(work, struct cgroup_subsys_state, dput_work);
4034 struct dentry *dentry = css->cgroup->dentry;
4035 struct super_block *sb = dentry->d_sb;
4036 4199
4037 atomic_inc(&sb->s_active); 4200 cgroup_dput(css->cgroup);
4038 dput(dentry); 4201}
4039 deactivate_super(sb); 4202
4203static void css_release(struct percpu_ref *ref)
4204{
4205 struct cgroup_subsys_state *css =
4206 container_of(ref, struct cgroup_subsys_state, refcnt);
4207
4208 schedule_work(&css->dput_work);
4040} 4209}
4041 4210
4042static void init_cgroup_css(struct cgroup_subsys_state *css, 4211static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -4044,10 +4213,9 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
4044 struct cgroup *cgrp) 4213 struct cgroup *cgrp)
4045{ 4214{
4046 css->cgroup = cgrp; 4215 css->cgroup = cgrp;
4047 atomic_set(&css->refcnt, 1);
4048 css->flags = 0; 4216 css->flags = 0;
4049 css->id = NULL; 4217 css->id = NULL;
4050 if (cgrp == dummytop) 4218 if (cgrp == cgroup_dummy_top)
4051 css->flags |= CSS_ROOT; 4219 css->flags |= CSS_ROOT;
4052 BUG_ON(cgrp->subsys[ss->subsys_id]); 4220 BUG_ON(cgrp->subsys[ss->subsys_id]);
4053 cgrp->subsys[ss->subsys_id] = css; 4221 cgrp->subsys[ss->subsys_id] = css;
@@ -4157,7 +4325,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4157 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 4325 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4158 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4326 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4159 4327
4160 for_each_subsys(root, ss) { 4328 for_each_root_subsys(root, ss) {
4161 struct cgroup_subsys_state *css; 4329 struct cgroup_subsys_state *css;
4162 4330
4163 css = ss->css_alloc(cgrp); 4331 css = ss->css_alloc(cgrp);
@@ -4165,7 +4333,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4165 err = PTR_ERR(css); 4333 err = PTR_ERR(css);
4166 goto err_free_all; 4334 goto err_free_all;
4167 } 4335 }
4336
4337 err = percpu_ref_init(&css->refcnt, css_release);
4338 if (err) {
4339 ss->css_free(cgrp);
4340 goto err_free_all;
4341 }
4342
4168 init_cgroup_css(css, ss, cgrp); 4343 init_cgroup_css(css, ss, cgrp);
4344
4169 if (ss->use_id) { 4345 if (ss->use_id) {
4170 err = alloc_css_id(ss, parent, cgrp); 4346 err = alloc_css_id(ss, parent, cgrp);
4171 if (err) 4347 if (err)
@@ -4183,20 +4359,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4183 goto err_free_all; 4359 goto err_free_all;
4184 lockdep_assert_held(&dentry->d_inode->i_mutex); 4360 lockdep_assert_held(&dentry->d_inode->i_mutex);
4185 4361
4362 cgrp->serial_nr = cgroup_serial_nr_next++;
4363
4186 /* allocation complete, commit to creation */ 4364 /* allocation complete, commit to creation */
4187 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4188 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4365 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4189 root->number_of_cgroups++; 4366 root->number_of_cgroups++;
4190 4367
4191 /* each css holds a ref to the cgroup's dentry */ 4368 /* each css holds a ref to the cgroup's dentry */
4192 for_each_subsys(root, ss) 4369 for_each_root_subsys(root, ss)
4193 dget(dentry); 4370 dget(dentry);
4194 4371
4195 /* hold a ref to the parent's dentry */ 4372 /* hold a ref to the parent's dentry */
4196 dget(parent->dentry); 4373 dget(parent->dentry);
4197 4374
4198 /* creation succeeded, notify subsystems */ 4375 /* creation succeeded, notify subsystems */
4199 for_each_subsys(root, ss) { 4376 for_each_root_subsys(root, ss) {
4200 err = online_css(ss, cgrp); 4377 err = online_css(ss, cgrp);
4201 if (err) 4378 if (err)
4202 goto err_destroy; 4379 goto err_destroy;
@@ -4221,9 +4398,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4221 return 0; 4398 return 0;
4222 4399
4223err_free_all: 4400err_free_all:
4224 for_each_subsys(root, ss) { 4401 for_each_root_subsys(root, ss) {
4225 if (cgrp->subsys[ss->subsys_id]) 4402 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4403
4404 if (css) {
4405 percpu_ref_cancel_init(&css->refcnt);
4226 ss->css_free(cgrp); 4406 ss->css_free(cgrp);
4407 }
4227 } 4408 }
4228 mutex_unlock(&cgroup_mutex); 4409 mutex_unlock(&cgroup_mutex);
4229 /* Release the reference count that we took on the superblock */ 4410 /* Release the reference count that we took on the superblock */
@@ -4251,63 +4432,120 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4251 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4432 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4252} 4433}
4253 4434
4435static void cgroup_css_killed(struct cgroup *cgrp)
4436{
4437 if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
4438 return;
4439
4440 /* percpu ref's of all css's are killed, kick off the next step */
4441 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
4442 schedule_work(&cgrp->destroy_work);
4443}
4444
4445static void css_ref_killed_fn(struct percpu_ref *ref)
4446{
4447 struct cgroup_subsys_state *css =
4448 container_of(ref, struct cgroup_subsys_state, refcnt);
4449
4450 cgroup_css_killed(css->cgroup);
4451}
4452
4453/**
4454 * cgroup_destroy_locked - the first stage of cgroup destruction
4455 * @cgrp: cgroup to be destroyed
4456 *
4457 * css's make use of percpu refcnts whose killing latency shouldn't be
4458 * exposed to userland and are RCU protected. Also, cgroup core needs to
4459 * guarantee that css_tryget() won't succeed by the time ->css_offline() is
4460 * invoked. To satisfy all the requirements, destruction is implemented in
4461 * the following two steps.
4462 *
4463 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
4464 * userland visible parts and start killing the percpu refcnts of
4465 * css's. Set up so that the next stage will be kicked off once all
4466 * the percpu refcnts are confirmed to be killed.
4467 *
4468 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4469 * rest of destruction. Once all cgroup references are gone, the
4470 * cgroup is RCU-freed.
4471 *
4472 * This function implements s1. After this step, @cgrp is gone as far as
4473 * the userland is concerned and a new cgroup with the same name may be
4474 * created. As cgroup doesn't care about the names internally, this
4475 * doesn't cause any problem.
4476 */
4254static int cgroup_destroy_locked(struct cgroup *cgrp) 4477static int cgroup_destroy_locked(struct cgroup *cgrp)
4255 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4478 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4256{ 4479{
4257 struct dentry *d = cgrp->dentry; 4480 struct dentry *d = cgrp->dentry;
4258 struct cgroup *parent = cgrp->parent;
4259 struct cgroup_event *event, *tmp; 4481 struct cgroup_event *event, *tmp;
4260 struct cgroup_subsys *ss; 4482 struct cgroup_subsys *ss;
4483 bool empty;
4261 4484
4262 lockdep_assert_held(&d->d_inode->i_mutex); 4485 lockdep_assert_held(&d->d_inode->i_mutex);
4263 lockdep_assert_held(&cgroup_mutex); 4486 lockdep_assert_held(&cgroup_mutex);
4264 4487
4265 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) 4488 /*
4489 * css_set_lock synchronizes access to ->cset_links and prevents
4490 * @cgrp from being removed while __put_css_set() is in progress.
4491 */
4492 read_lock(&css_set_lock);
4493 empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children);
4494 read_unlock(&css_set_lock);
4495 if (!empty)
4266 return -EBUSY; 4496 return -EBUSY;
4267 4497
4268 /* 4498 /*
4269 * Block new css_tryget() by deactivating refcnt and mark @cgrp 4499 * Block new css_tryget() by killing css refcnts. cgroup core
4270 * removed. This makes future css_tryget() and child creation 4500 * guarantees that, by the time ->css_offline() is invoked, no new
4271 * attempts fail thus maintaining the removal conditions verified 4501 * css reference will be given out via css_tryget(). We can't
4272 * above. 4502 * simply call percpu_ref_kill() and proceed to offlining css's
4503 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4504 * as killed on all CPUs on return.
4505 *
4506 * Use percpu_ref_kill_and_confirm() to get notifications as each
4507 * css is confirmed to be seen as killed on all CPUs. The
4508 * notification callback keeps track of the number of css's to be
4509 * killed and schedules cgroup_offline_fn() to perform the rest of
4510 * destruction once the percpu refs of all css's are confirmed to
4511 * be killed.
4273 */ 4512 */
4274 for_each_subsys(cgrp->root, ss) { 4513 atomic_set(&cgrp->css_kill_cnt, 1);
4514 for_each_root_subsys(cgrp->root, ss) {
4275 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4515 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4276 4516
4277 WARN_ON(atomic_read(&css->refcnt) < 0); 4517 /*
4278 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4518 * Killing would put the base ref, but we need to keep it
4279 } 4519 * alive until after ->css_offline.
4280 set_bit(CGRP_REMOVED, &cgrp->flags); 4520 */
4521 percpu_ref_get(&css->refcnt);
4281 4522
4282 /* tell subsystems to initate destruction */ 4523 atomic_inc(&cgrp->css_kill_cnt);
4283 for_each_subsys(cgrp->root, ss) 4524 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4284 offline_css(ss, cgrp); 4525 }
4526 cgroup_css_killed(cgrp);
4285 4527
4286 /* 4528 /*
4287 * Put all the base refs. Each css holds an extra reference to the 4529 * Mark @cgrp dead. This prevents further task migration and child
4288 * cgroup's dentry and cgroup removal proceeds regardless of css 4530 * creation by disabling cgroup_lock_live_group(). Note that
4289 * refs. On the last put of each css, whenever that may be, the 4531 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to
4290 * extra dentry ref is put so that dentry destruction happens only 4532 * resume iteration after dropping RCU read lock. See
4291 * after all css's are released. 4533 * cgroup_next_sibling() for details.
4292 */ 4534 */
4293 for_each_subsys(cgrp->root, ss) 4535 set_bit(CGRP_DEAD, &cgrp->flags);
4294 css_put(cgrp->subsys[ss->subsys_id]);
4295 4536
4537 /* CGRP_DEAD is set, remove from ->release_list for the last time */
4296 raw_spin_lock(&release_list_lock); 4538 raw_spin_lock(&release_list_lock);
4297 if (!list_empty(&cgrp->release_list)) 4539 if (!list_empty(&cgrp->release_list))
4298 list_del_init(&cgrp->release_list); 4540 list_del_init(&cgrp->release_list);
4299 raw_spin_unlock(&release_list_lock); 4541 raw_spin_unlock(&release_list_lock);
4300 4542
4301 /* delete this cgroup from parent->children */ 4543 /*
4302 list_del_rcu(&cgrp->sibling); 4544 * Remove @cgrp directory. The removal puts the base ref but we
4303 list_del_init(&cgrp->allcg_node); 4545 * aren't quite done with @cgrp yet, so hold onto it.
4304 4546 */
4305 dget(d); 4547 dget(d);
4306 cgroup_d_remove_dir(d); 4548 cgroup_d_remove_dir(d);
4307 dput(d);
4308
4309 set_bit(CGRP_RELEASABLE, &parent->flags);
4310 check_for_release(parent);
4311 4549
4312 /* 4550 /*
4313 * Unregister events and notify userspace. 4551 * Unregister events and notify userspace.
@@ -4322,6 +4560,53 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4322 spin_unlock(&cgrp->event_list_lock); 4560 spin_unlock(&cgrp->event_list_lock);
4323 4561
4324 return 0; 4562 return 0;
4563};
4564
4565/**
4566 * cgroup_offline_fn - the second step of cgroup destruction
4567 * @work: cgroup->destroy_free_work
4568 *
4569 * This function is invoked from a work item for a cgroup which is being
4570 * destroyed after the percpu refcnts of all css's are guaranteed to be
4571 * seen as killed on all CPUs, and performs the rest of destruction. This
4572 * is the second step of destruction described in the comment above
4573 * cgroup_destroy_locked().
4574 */
4575static void cgroup_offline_fn(struct work_struct *work)
4576{
4577 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
4578 struct cgroup *parent = cgrp->parent;
4579 struct dentry *d = cgrp->dentry;
4580 struct cgroup_subsys *ss;
4581
4582 mutex_lock(&cgroup_mutex);
4583
4584 /*
4585 * css_tryget() is guaranteed to fail now. Tell subsystems to
4586 * initiate destruction.
4587 */
4588 for_each_root_subsys(cgrp->root, ss)
4589 offline_css(ss, cgrp);
4590
4591 /*
4592 * Put the css refs from cgroup_destroy_locked(). Each css holds
4593 * an extra reference to the cgroup's dentry and cgroup removal
4594 * proceeds regardless of css refs. On the last put of each css,
4595 * whenever that may be, the extra dentry ref is put so that dentry
4596 * destruction happens only after all css's are released.
4597 */
4598 for_each_root_subsys(cgrp->root, ss)
4599 css_put(cgrp->subsys[ss->subsys_id]);
4600
4601 /* delete this cgroup from parent->children */
4602 list_del_rcu(&cgrp->sibling);
4603
4604 dput(d);
4605
4606 set_bit(CGRP_RELEASABLE, &parent->flags);
4607 check_for_release(parent);
4608
4609 mutex_unlock(&cgroup_mutex);
4325} 4610}
4326 4611
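The two-step destruction above hinges on percpu_ref_kill_and_confirm(): the confirm callback fires only once every CPU is guaranteed to see the ref as dead, which is when ->css_offline() may safely run. A compact sketch of that handshake for a made-up object (struct example_obj2 with a teardown_work item):

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>

struct example_obj2 {
	struct percpu_ref refcnt;
	struct work_struct teardown_work;	/* performs the offline step */
};

/* invoked once the killed ref is seen as dead on all CPUs */
static void example_confirm_kill(struct percpu_ref *ref)
{
	struct example_obj2 *obj = container_of(ref, struct example_obj2, refcnt);

	schedule_work(&obj->teardown_work);
}

static void example_start_destroy(struct example_obj2 *obj)
{
	/* new tryget()s fail from here on; teardown continues from the work item */
	percpu_ref_kill_and_confirm(&obj->refcnt, example_confirm_kill);
}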
4327static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4612static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4361,12 +4646,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4361 cgroup_init_cftsets(ss); 4646 cgroup_init_cftsets(ss);
4362 4647
4363 /* Create the top cgroup state for this subsystem */ 4648 /* Create the top cgroup state for this subsystem */
4364 list_add(&ss->sibling, &rootnode.subsys_list); 4649 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4365 ss->root = &rootnode; 4650 ss->root = &cgroup_dummy_root;
4366 css = ss->css_alloc(dummytop); 4651 css = ss->css_alloc(cgroup_dummy_top);
4367 /* We don't handle early failures gracefully */ 4652 /* We don't handle early failures gracefully */
4368 BUG_ON(IS_ERR(css)); 4653 BUG_ON(IS_ERR(css));
4369 init_cgroup_css(css, ss, dummytop); 4654 init_cgroup_css(css, ss, cgroup_dummy_top);
4370 4655
4371 /* Update the init_css_set to contain a subsys 4656 /* Update the init_css_set to contain a subsys
4372 * pointer to this state - since the subsystem is 4657 * pointer to this state - since the subsystem is
@@ -4381,7 +4666,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4381 * need to invoke fork callbacks here. */ 4666 * need to invoke fork callbacks here. */
4382 BUG_ON(!list_empty(&init_task.tasks)); 4667 BUG_ON(!list_empty(&init_task.tasks));
4383 4668
4384 BUG_ON(online_css(ss, dummytop)); 4669 BUG_ON(online_css(ss, cgroup_dummy_top));
4385 4670
4386 mutex_unlock(&cgroup_mutex); 4671 mutex_unlock(&cgroup_mutex);
4387 4672
@@ -4404,7 +4689,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4404 struct cgroup_subsys_state *css; 4689 struct cgroup_subsys_state *css;
4405 int i, ret; 4690 int i, ret;
4406 struct hlist_node *tmp; 4691 struct hlist_node *tmp;
4407 struct css_set *cg; 4692 struct css_set *cset;
4408 unsigned long key; 4693 unsigned long key;
4409 4694
4410 /* check name and function validity */ 4695 /* check name and function validity */
@@ -4427,7 +4712,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4427 */ 4712 */
4428 if (ss->module == NULL) { 4713 if (ss->module == NULL) {
4429 /* a sanity check */ 4714 /* a sanity check */
4430 BUG_ON(subsys[ss->subsys_id] != ss); 4715 BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
4431 return 0; 4716 return 0;
4432 } 4717 }
4433 4718
@@ -4435,26 +4720,26 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4435 cgroup_init_cftsets(ss); 4720 cgroup_init_cftsets(ss);
4436 4721
4437 mutex_lock(&cgroup_mutex); 4722 mutex_lock(&cgroup_mutex);
4438 subsys[ss->subsys_id] = ss; 4723 cgroup_subsys[ss->subsys_id] = ss;
4439 4724
4440 /* 4725 /*
4441 * no ss->css_alloc seems to need anything important in the ss 4726 * no ss->css_alloc seems to need anything important in the ss
4442 * struct, so this can happen first (i.e. before the rootnode 4727 * struct, so this can happen first (i.e. before the dummy root
4443 * attachment). 4728 * attachment).
4444 */ 4729 */
4445 css = ss->css_alloc(dummytop); 4730 css = ss->css_alloc(cgroup_dummy_top);
4446 if (IS_ERR(css)) { 4731 if (IS_ERR(css)) {
4447 /* failure case - need to deassign the subsys[] slot. */ 4732 /* failure case - need to deassign the cgroup_subsys[] slot. */
4448 subsys[ss->subsys_id] = NULL; 4733 cgroup_subsys[ss->subsys_id] = NULL;
4449 mutex_unlock(&cgroup_mutex); 4734 mutex_unlock(&cgroup_mutex);
4450 return PTR_ERR(css); 4735 return PTR_ERR(css);
4451 } 4736 }
4452 4737
4453 list_add(&ss->sibling, &rootnode.subsys_list); 4738 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4454 ss->root = &rootnode; 4739 ss->root = &cgroup_dummy_root;
4455 4740
4456 /* our new subsystem will be attached to the dummy hierarchy. */ 4741 /* our new subsystem will be attached to the dummy hierarchy. */
4457 init_cgroup_css(css, ss, dummytop); 4742 init_cgroup_css(css, ss, cgroup_dummy_top);
4458 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4743 /* init_idr must be after init_cgroup_css because it sets css->id. */
4459 if (ss->use_id) { 4744 if (ss->use_id) {
4460 ret = cgroup_init_idr(ss, css); 4745 ret = cgroup_init_idr(ss, css);
@@ -4471,21 +4756,21 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4471 * this is all done under the css_set_lock. 4756 * this is all done under the css_set_lock.
4472 */ 4757 */
4473 write_lock(&css_set_lock); 4758 write_lock(&css_set_lock);
4474 hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { 4759 hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
4475 /* skip entries that we already rehashed */ 4760 /* skip entries that we already rehashed */
4476 if (cg->subsys[ss->subsys_id]) 4761 if (cset->subsys[ss->subsys_id])
4477 continue; 4762 continue;
4478 /* remove existing entry */ 4763 /* remove existing entry */
4479 hash_del(&cg->hlist); 4764 hash_del(&cset->hlist);
4480 /* set new value */ 4765 /* set new value */
4481 cg->subsys[ss->subsys_id] = css; 4766 cset->subsys[ss->subsys_id] = css;
4482 /* recompute hash and restore entry */ 4767 /* recompute hash and restore entry */
4483 key = css_set_hash(cg->subsys); 4768 key = css_set_hash(cset->subsys);
4484 hash_add(css_set_table, &cg->hlist, key); 4769 hash_add(css_set_table, &cset->hlist, key);
4485 } 4770 }
4486 write_unlock(&css_set_lock); 4771 write_unlock(&css_set_lock);
4487 4772
4488 ret = online_css(ss, dummytop); 4773 ret = online_css(ss, cgroup_dummy_top);
4489 if (ret) 4774 if (ret)
4490 goto err_unload; 4775 goto err_unload;
4491 4776
@@ -4511,7 +4796,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4511 */ 4796 */
4512void cgroup_unload_subsys(struct cgroup_subsys *ss) 4797void cgroup_unload_subsys(struct cgroup_subsys *ss)
4513{ 4798{
4514 struct cg_cgroup_link *link; 4799 struct cgrp_cset_link *link;
4515 4800
4516 BUG_ON(ss->module == NULL); 4801 BUG_ON(ss->module == NULL);
4517 4802
@@ -4520,45 +4805,46 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4520 * try_module_get in parse_cgroupfs_options should ensure that it 4805 * try_module_get in parse_cgroupfs_options should ensure that it
4521 * doesn't start being used while we're killing it off. 4806 * doesn't start being used while we're killing it off.
4522 */ 4807 */
4523 BUG_ON(ss->root != &rootnode); 4808 BUG_ON(ss->root != &cgroup_dummy_root);
4524 4809
4525 mutex_lock(&cgroup_mutex); 4810 mutex_lock(&cgroup_mutex);
4526 4811
4527 offline_css(ss, dummytop); 4812 offline_css(ss, cgroup_dummy_top);
4528 4813
4529 if (ss->use_id) 4814 if (ss->use_id)
4530 idr_destroy(&ss->idr); 4815 idr_destroy(&ss->idr);
4531 4816
4532 /* deassign the subsys_id */ 4817 /* deassign the subsys_id */
4533 subsys[ss->subsys_id] = NULL; 4818 cgroup_subsys[ss->subsys_id] = NULL;
4534 4819
4535 /* remove subsystem from rootnode's list of subsystems */ 4820 /* remove subsystem from the dummy root's list of subsystems */
4536 list_del_init(&ss->sibling); 4821 list_del_init(&ss->sibling);
4537 4822
4538 /* 4823 /*
4539 * disentangle the css from all css_sets attached to the dummytop. as 4824 * disentangle the css from all css_sets attached to the dummy
4540 * in loading, we need to pay our respects to the hashtable gods. 4825 * top. as in loading, we need to pay our respects to the hashtable
4826 * gods.
4541 */ 4827 */
4542 write_lock(&css_set_lock); 4828 write_lock(&css_set_lock);
4543 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { 4829 list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
4544 struct css_set *cg = link->cg; 4830 struct css_set *cset = link->cset;
4545 unsigned long key; 4831 unsigned long key;
4546 4832
4547 hash_del(&cg->hlist); 4833 hash_del(&cset->hlist);
4548 cg->subsys[ss->subsys_id] = NULL; 4834 cset->subsys[ss->subsys_id] = NULL;
4549 key = css_set_hash(cg->subsys); 4835 key = css_set_hash(cset->subsys);
4550 hash_add(css_set_table, &cg->hlist, key); 4836 hash_add(css_set_table, &cset->hlist, key);
4551 } 4837 }
4552 write_unlock(&css_set_lock); 4838 write_unlock(&css_set_lock);
4553 4839
4554 /* 4840 /*
4555 * remove subsystem's css from the dummytop and free it - need to 4841 * remove subsystem's css from the cgroup_dummy_top and free it -
4556 * free before marking as null because ss->css_free needs the 4842 * need to free before marking as null because ss->css_free needs
4557 * cgrp->subsys pointer to find their state. note that this also 4843 * the cgrp->subsys pointer to find their state. note that this
4558 * takes care of freeing the css_id. 4844 * also takes care of freeing the css_id.
4559 */ 4845 */
4560 ss->css_free(dummytop); 4846 ss->css_free(cgroup_dummy_top);
4561 dummytop->subsys[ss->subsys_id] = NULL; 4847 cgroup_dummy_top->subsys[ss->subsys_id] = NULL;
4562 4848
4563 mutex_unlock(&cgroup_mutex); 4849 mutex_unlock(&cgroup_mutex);
4564} 4850}
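Both the load and unload paths above have to pull each css_set out of css_set_table, rewrite the subsys[] slot, recompute the key and re-insert it, because that slot feeds the hash. A tiny self-contained version of that del / modify / re-add dance (chained table and types invented for the example, not the kernel's hashtable API):

/*
 * Re-hash-on-key-change sketch: when a field that feeds the hash changes
 * (here, one slot of subsys[]), the entry must be unhooked, modified and
 * re-inserted under its new key.  Minimal chained table, invented types.
 */
#include <stdio.h>
#include <stdlib.h>

#define NBUCKETS 8
#define NSUBSYS  4

struct cset {
	void	    *subsys[NSUBSYS];	/* participates in the hash key */
	struct cset *next;		/* hlist analogue */
};

static struct cset *table[NBUCKETS];

static unsigned int cset_hash(void **subsys)
{
	unsigned long h = 0;

	for (int i = 0; i < NSUBSYS; i++)
		h = h * 31 + (unsigned long)subsys[i];
	return h % NBUCKETS;
}

static void hash_add(struct cset *cset)
{
	unsigned int key = cset_hash(cset->subsys);

	cset->next = table[key];
	table[key] = cset;
}

static void hash_del(struct cset *cset)
{
	unsigned int key = cset_hash(cset->subsys);
	struct cset **p = &table[key];

	while (*p && *p != cset)
		p = &(*p)->next;
	if (*p)
		*p = cset->next;
}

int main(void)
{
	struct cset *cset = calloc(1, sizeof(*cset));
	void *new_css = (void *)0x1234;		/* stand-in for the new css */

	hash_add(cset);

	/* the subsys[] slot is part of the key, so: del, modify, re-add */
	hash_del(cset);
	cset->subsys[2] = new_css;
	hash_add(cset);

	printf("cset now lives in bucket %u\n", cset_hash(cset->subsys));
	free(cset);
	return 0;
}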
@@ -4572,30 +4858,25 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4572 */ 4858 */
4573int __init cgroup_init_early(void) 4859int __init cgroup_init_early(void)
4574{ 4860{
4861 struct cgroup_subsys *ss;
4575 int i; 4862 int i;
4863
4576 atomic_set(&init_css_set.refcount, 1); 4864 atomic_set(&init_css_set.refcount, 1);
4577 INIT_LIST_HEAD(&init_css_set.cg_links); 4865 INIT_LIST_HEAD(&init_css_set.cgrp_links);
4578 INIT_LIST_HEAD(&init_css_set.tasks); 4866 INIT_LIST_HEAD(&init_css_set.tasks);
4579 INIT_HLIST_NODE(&init_css_set.hlist); 4867 INIT_HLIST_NODE(&init_css_set.hlist);
4580 css_set_count = 1; 4868 css_set_count = 1;
4581 init_cgroup_root(&rootnode); 4869 init_cgroup_root(&cgroup_dummy_root);
4582 root_count = 1; 4870 cgroup_root_count = 1;
4583 init_task.cgroups = &init_css_set; 4871 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4584
4585 init_css_set_link.cg = &init_css_set;
4586 init_css_set_link.cgrp = dummytop;
4587 list_add(&init_css_set_link.cgrp_link_list,
4588 &rootnode.top_cgroup.css_sets);
4589 list_add(&init_css_set_link.cg_link_list,
4590 &init_css_set.cg_links);
4591
4592 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4593 struct cgroup_subsys *ss = subsys[i];
4594
4595 /* at bootup time, we don't worry about modular subsystems */
4596 if (!ss || ss->module)
4597 continue;
4598 4872
4873 init_cgrp_cset_link.cset = &init_css_set;
4874 init_cgrp_cset_link.cgrp = cgroup_dummy_top;
4875 list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
4876 list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);
4877
4878 /* at bootup time, we don't worry about modular subsystems */
4879 for_each_builtin_subsys(ss, i) {
4599 BUG_ON(!ss->name); 4880 BUG_ON(!ss->name);
4600 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4881 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4601 BUG_ON(!ss->css_alloc); 4882 BUG_ON(!ss->css_alloc);
@@ -4620,30 +4901,33 @@ int __init cgroup_init_early(void)
4620 */ 4901 */
4621int __init cgroup_init(void) 4902int __init cgroup_init(void)
4622{ 4903{
4623 int err; 4904 struct cgroup_subsys *ss;
4624 int i;
4625 unsigned long key; 4905 unsigned long key;
4906 int i, err;
4626 4907
4627 err = bdi_init(&cgroup_backing_dev_info); 4908 err = bdi_init(&cgroup_backing_dev_info);
4628 if (err) 4909 if (err)
4629 return err; 4910 return err;
4630 4911
4631 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4912 for_each_builtin_subsys(ss, i) {
4632 struct cgroup_subsys *ss = subsys[i];
4633
4634 /* at bootup time, we don't worry about modular subsystems */
4635 if (!ss || ss->module)
4636 continue;
4637 if (!ss->early_init) 4913 if (!ss->early_init)
4638 cgroup_init_subsys(ss); 4914 cgroup_init_subsys(ss);
4639 if (ss->use_id) 4915 if (ss->use_id)
4640 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); 4916 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
4641 } 4917 }
4642 4918
4919 /* allocate id for the dummy hierarchy */
4920 mutex_lock(&cgroup_mutex);
4921 mutex_lock(&cgroup_root_mutex);
4922
4643 /* Add init_css_set to the hash table */ 4923 /* Add init_css_set to the hash table */
4644 key = css_set_hash(init_css_set.subsys); 4924 key = css_set_hash(init_css_set.subsys);
4645 hash_add(css_set_table, &init_css_set.hlist, key); 4925 hash_add(css_set_table, &init_css_set.hlist, key);
4646 BUG_ON(!init_root_id(&rootnode)); 4926
4927 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
4928
4929 mutex_unlock(&cgroup_root_mutex);
4930 mutex_unlock(&cgroup_mutex);
4647 4931
4648 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4932 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4649 if (!cgroup_kobj) { 4933 if (!cgroup_kobj) {
@@ -4708,7 +4992,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4708 int count = 0; 4992 int count = 0;
4709 4993
4710 seq_printf(m, "%d:", root->hierarchy_id); 4994 seq_printf(m, "%d:", root->hierarchy_id);
4711 for_each_subsys(root, ss) 4995 for_each_root_subsys(root, ss)
4712 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4996 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4713 if (strlen(root->name)) 4997 if (strlen(root->name))
4714 seq_printf(m, "%sname=%s", count ? "," : "", 4998 seq_printf(m, "%sname=%s", count ? "," : "",
@@ -4734,6 +5018,7 @@ out:
4734/* Display information about each subsystem and each hierarchy */ 5018/* Display information about each subsystem and each hierarchy */
4735static int proc_cgroupstats_show(struct seq_file *m, void *v) 5019static int proc_cgroupstats_show(struct seq_file *m, void *v)
4736{ 5020{
5021 struct cgroup_subsys *ss;
4737 int i; 5022 int i;
4738 5023
4739 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 5024 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
@@ -4743,14 +5028,12 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
4743 * subsys/hierarchy state. 5028 * subsys/hierarchy state.
4744 */ 5029 */
4745 mutex_lock(&cgroup_mutex); 5030 mutex_lock(&cgroup_mutex);
4746 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 5031
4747 struct cgroup_subsys *ss = subsys[i]; 5032 for_each_subsys(ss, i)
4748 if (ss == NULL)
4749 continue;
4750 seq_printf(m, "%s\t%d\t%d\t%d\n", 5033 seq_printf(m, "%s\t%d\t%d\t%d\n",
4751 ss->name, ss->root->hierarchy_id, 5034 ss->name, ss->root->hierarchy_id,
4752 ss->root->number_of_cgroups, !ss->disabled); 5035 ss->root->number_of_cgroups, !ss->disabled);
4753 } 5036
4754 mutex_unlock(&cgroup_mutex); 5037 mutex_unlock(&cgroup_mutex);
4755 return 0; 5038 return 0;
4756} 5039}
@@ -4786,8 +5069,8 @@ static const struct file_operations proc_cgroupstats_operations = {
4786void cgroup_fork(struct task_struct *child) 5069void cgroup_fork(struct task_struct *child)
4787{ 5070{
4788 task_lock(current); 5071 task_lock(current);
5072 get_css_set(task_css_set(current));
4789 child->cgroups = current->cgroups; 5073 child->cgroups = current->cgroups;
4790 get_css_set(child->cgroups);
4791 task_unlock(current); 5074 task_unlock(current);
4792 INIT_LIST_HEAD(&child->cg_list); 5075 INIT_LIST_HEAD(&child->cg_list);
4793} 5076}
@@ -4804,6 +5087,7 @@ void cgroup_fork(struct task_struct *child)
4804 */ 5087 */
4805void cgroup_post_fork(struct task_struct *child) 5088void cgroup_post_fork(struct task_struct *child)
4806{ 5089{
5090 struct cgroup_subsys *ss;
4807 int i; 5091 int i;
4808 5092
4809 /* 5093 /*
@@ -4821,7 +5105,7 @@ void cgroup_post_fork(struct task_struct *child)
4821 write_lock(&css_set_lock); 5105 write_lock(&css_set_lock);
4822 task_lock(child); 5106 task_lock(child);
4823 if (list_empty(&child->cg_list)) 5107 if (list_empty(&child->cg_list))
4824 list_add(&child->cg_list, &child->cgroups->tasks); 5108 list_add(&child->cg_list, &task_css_set(child)->tasks);
4825 task_unlock(child); 5109 task_unlock(child);
4826 write_unlock(&css_set_lock); 5110 write_unlock(&css_set_lock);
4827 } 5111 }
@@ -4840,12 +5124,9 @@ void cgroup_post_fork(struct task_struct *child)
4840 * of the array can be freed at module unload, so we 5124 * of the array can be freed at module unload, so we
4841 * can't touch that. 5125 * can't touch that.
4842 */ 5126 */
4843 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 5127 for_each_builtin_subsys(ss, i)
4844 struct cgroup_subsys *ss = subsys[i];
4845
4846 if (ss->fork) 5128 if (ss->fork)
4847 ss->fork(child); 5129 ss->fork(child);
4848 }
4849 } 5130 }
4850} 5131}
4851 5132
@@ -4886,7 +5167,8 @@ void cgroup_post_fork(struct task_struct *child)
4886 */ 5167 */
4887void cgroup_exit(struct task_struct *tsk, int run_callbacks) 5168void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4888{ 5169{
4889 struct css_set *cg; 5170 struct cgroup_subsys *ss;
5171 struct css_set *cset;
4890 int i; 5172 int i;
4891 5173
4892 /* 5174 /*
@@ -4903,36 +5185,32 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4903 5185
4904 /* Reassign the task to the init_css_set. */ 5186 /* Reassign the task to the init_css_set. */
4905 task_lock(tsk); 5187 task_lock(tsk);
4906 cg = tsk->cgroups; 5188 cset = task_css_set(tsk);
4907 tsk->cgroups = &init_css_set; 5189 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
4908 5190
4909 if (run_callbacks && need_forkexit_callback) { 5191 if (run_callbacks && need_forkexit_callback) {
4910 /* 5192 /*
4911 * fork/exit callbacks are supported only for builtin 5193 * fork/exit callbacks are supported only for builtin
4912 * subsystems, see cgroup_post_fork() for details. 5194 * subsystems, see cgroup_post_fork() for details.
4913 */ 5195 */
4914 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 5196 for_each_builtin_subsys(ss, i) {
4915 struct cgroup_subsys *ss = subsys[i];
4916
4917 if (ss->exit) { 5197 if (ss->exit) {
4918 struct cgroup *old_cgrp = 5198 struct cgroup *old_cgrp = cset->subsys[i]->cgroup;
4919 rcu_dereference_raw(cg->subsys[i])->cgroup;
4920 struct cgroup *cgrp = task_cgroup(tsk, i); 5199 struct cgroup *cgrp = task_cgroup(tsk, i);
5200
4921 ss->exit(cgrp, old_cgrp, tsk); 5201 ss->exit(cgrp, old_cgrp, tsk);
4922 } 5202 }
4923 } 5203 }
4924 } 5204 }
4925 task_unlock(tsk); 5205 task_unlock(tsk);
4926 5206
4927 put_css_set_taskexit(cg); 5207 put_css_set_taskexit(cset);
4928} 5208}
4929 5209
4930static void check_for_release(struct cgroup *cgrp) 5210static void check_for_release(struct cgroup *cgrp)
4931{ 5211{
4932 /* All of these checks rely on RCU to keep the cgroup
4933 * structure alive */
4934 if (cgroup_is_releasable(cgrp) && 5212 if (cgroup_is_releasable(cgrp) &&
4935 !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { 5213 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
4936 /* 5214 /*
4937 * Control Group is currently removable. If it's not 5215 * Control Group is currently removable. If it's not
4938 * already queued for a userspace notification, queue 5216 * already queued for a userspace notification, queue
@@ -4941,7 +5219,7 @@ static void check_for_release(struct cgroup *cgrp)
4941 int need_schedule_work = 0; 5219 int need_schedule_work = 0;
4942 5220
4943 raw_spin_lock(&release_list_lock); 5221 raw_spin_lock(&release_list_lock);
4944 if (!cgroup_is_removed(cgrp) && 5222 if (!cgroup_is_dead(cgrp) &&
4945 list_empty(&cgrp->release_list)) { 5223 list_empty(&cgrp->release_list)) {
4946 list_add(&cgrp->release_list, &release_list); 5224 list_add(&cgrp->release_list, &release_list);
4947 need_schedule_work = 1; 5225 need_schedule_work = 1;
@@ -4952,34 +5230,6 @@ static void check_for_release(struct cgroup *cgrp)
4952 } 5230 }
4953} 5231}
4954 5232
4955/* Caller must verify that the css is not for root cgroup */
4956bool __css_tryget(struct cgroup_subsys_state *css)
4957{
4958 while (true) {
4959 int t, v;
4960
4961 v = css_refcnt(css);
4962 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
4963 if (likely(t == v))
4964 return true;
4965 else if (t < 0)
4966 return false;
4967 cpu_relax();
4968 }
4969}
4970EXPORT_SYMBOL_GPL(__css_tryget);
4971
4972/* Caller must verify that the css is not for root cgroup */
4973void __css_put(struct cgroup_subsys_state *css)
4974{
4975 int v;
4976
4977 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
4978 if (v == 0)
4979 schedule_work(&css->dput_work);
4980}
4981EXPORT_SYMBOL_GPL(__css_put);
4982
4983/* 5233/*
4984 * Notify userspace when a cgroup is released, by running the 5234 * Notify userspace when a cgroup is released, by running the
4985 * configured release agent with the name of the cgroup (path 5235 * configured release agent with the name of the cgroup (path
@@ -5054,23 +5304,19 @@ static void cgroup_release_agent(struct work_struct *work)
5054 5304
5055static int __init cgroup_disable(char *str) 5305static int __init cgroup_disable(char *str)
5056{ 5306{
5057 int i; 5307 struct cgroup_subsys *ss;
5058 char *token; 5308 char *token;
5309 int i;
5059 5310
5060 while ((token = strsep(&str, ",")) != NULL) { 5311 while ((token = strsep(&str, ",")) != NULL) {
5061 if (!*token) 5312 if (!*token)
5062 continue; 5313 continue;
5063 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
5064 struct cgroup_subsys *ss = subsys[i];
5065
5066 /*
5067 * cgroup_disable, being at boot time, can't
5068 * know about module subsystems, so we don't
5069 * worry about them.
5070 */
5071 if (!ss || ss->module)
5072 continue;
5073 5314
5315 /*
5316 * cgroup_disable, being at boot time, can't know about
5317 * module subsystems, so we don't worry about them.
5318 */
5319 for_each_builtin_subsys(ss, i) {
5074 if (!strcmp(token, ss->name)) { 5320 if (!strcmp(token, ss->name)) {
5075 ss->disabled = 1; 5321 ss->disabled = 1;
5076 printk(KERN_INFO "Disabling %s control group" 5322 printk(KERN_INFO "Disabling %s control group"
@@ -5087,9 +5333,7 @@ __setup("cgroup_disable=", cgroup_disable);
5087 * Functions for CSS ID. 5333 * Functions for CSS ID.
5088 */ 5334 */
5089 5335
5090/* 5336/* to get ID other than 0, this should be called when !cgroup_is_dead() */
5091 *To get ID other than 0, this should be called when !cgroup_is_removed().
5092 */
5093unsigned short css_id(struct cgroup_subsys_state *css) 5337unsigned short css_id(struct cgroup_subsys_state *css)
5094{ 5338{
5095 struct css_id *cssid; 5339 struct css_id *cssid;
@@ -5099,7 +5343,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
5099 * on this or this is under rcu_read_lock(). Once css->id is allocated, 5343 * on this or this is under rcu_read_lock(). Once css->id is allocated,
5100 * it's unchanged until freed. 5344 * it's unchanged until freed.
5101 */ 5345 */
5102 cssid = rcu_dereference_check(css->id, css_refcnt(css)); 5346 cssid = rcu_dereference_raw(css->id);
5103 5347
5104 if (cssid) 5348 if (cssid)
5105 return cssid->id; 5349 return cssid->id;
@@ -5107,18 +5351,6 @@ unsigned short css_id(struct cgroup_subsys_state *css)
5107} 5351}
5108EXPORT_SYMBOL_GPL(css_id); 5352EXPORT_SYMBOL_GPL(css_id);
5109 5353
5110unsigned short css_depth(struct cgroup_subsys_state *css)
5111{
5112 struct css_id *cssid;
5113
5114 cssid = rcu_dereference_check(css->id, css_refcnt(css));
5115
5116 if (cssid)
5117 return cssid->depth;
5118 return 0;
5119}
5120EXPORT_SYMBOL_GPL(css_depth);
5121
5122/** 5354/**
5123 * css_is_ancestor - test "root" css is an ancestor of "child" 5355 * css_is_ancestor - test "root" css is an ancestor of "child"
5124 * @child: the css to be tested. 5356 * @child: the css to be tested.
@@ -5153,7 +5385,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
5153 5385
5154void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 5386void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
5155{ 5387{
5156 struct css_id *id = css->id; 5388 struct css_id *id = rcu_dereference_protected(css->id, true);
5389
5157 /* When this is called before css_id initialization, id can be NULL */ 5390 /* When this is called before css_id initialization, id can be NULL */
5158 if (!id) 5391 if (!id)
5159 return; 5392 return;
@@ -5219,8 +5452,8 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5219 return PTR_ERR(newid); 5452 return PTR_ERR(newid);
5220 5453
5221 newid->stack[0] = newid->id; 5454 newid->stack[0] = newid->id;
5222 newid->css = rootcss; 5455 RCU_INIT_POINTER(newid->css, rootcss);
5223 rootcss->id = newid; 5456 RCU_INIT_POINTER(rootcss->id, newid);
5224 return 0; 5457 return 0;
5225} 5458}
5226 5459
@@ -5234,7 +5467,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
5234 subsys_id = ss->subsys_id; 5467 subsys_id = ss->subsys_id;
5235 parent_css = parent->subsys[subsys_id]; 5468 parent_css = parent->subsys[subsys_id];
5236 child_css = child->subsys[subsys_id]; 5469 child_css = child->subsys[subsys_id];
5237 parent_id = parent_css->id; 5470 parent_id = rcu_dereference_protected(parent_css->id, true);
5238 depth = parent_id->depth + 1; 5471 depth = parent_id->depth + 1;
5239 5472
5240 child_id = get_new_cssid(ss, depth); 5473 child_id = get_new_cssid(ss, depth);
@@ -5299,7 +5532,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5299} 5532}
5300 5533
5301#ifdef CONFIG_CGROUP_DEBUG 5534#ifdef CONFIG_CGROUP_DEBUG
5302static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) 5535static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
5303{ 5536{
5304 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5537 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5305 5538
@@ -5309,48 +5542,43 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5309 return css; 5542 return css;
5310} 5543}
5311 5544
5312static void debug_css_free(struct cgroup *cont) 5545static void debug_css_free(struct cgroup *cgrp)
5313{
5314 kfree(cont->subsys[debug_subsys_id]);
5315}
5316
5317static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
5318{ 5546{
5319 return atomic_read(&cont->count); 5547 kfree(cgrp->subsys[debug_subsys_id]);
5320} 5548}
5321 5549
5322static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) 5550static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft)
5323{ 5551{
5324 return cgroup_task_count(cont); 5552 return cgroup_task_count(cgrp);
5325} 5553}
5326 5554
5327static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) 5555static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft)
5328{ 5556{
5329 return (u64)(unsigned long)current->cgroups; 5557 return (u64)(unsigned long)current->cgroups;
5330} 5558}
5331 5559
5332static u64 current_css_set_refcount_read(struct cgroup *cont, 5560static u64 current_css_set_refcount_read(struct cgroup *cgrp,
5333 struct cftype *cft) 5561 struct cftype *cft)
5334{ 5562{
5335 u64 count; 5563 u64 count;
5336 5564
5337 rcu_read_lock(); 5565 rcu_read_lock();
5338 count = atomic_read(&current->cgroups->refcount); 5566 count = atomic_read(&task_css_set(current)->refcount);
5339 rcu_read_unlock(); 5567 rcu_read_unlock();
5340 return count; 5568 return count;
5341} 5569}
5342 5570
5343static int current_css_set_cg_links_read(struct cgroup *cont, 5571static int current_css_set_cg_links_read(struct cgroup *cgrp,
5344 struct cftype *cft, 5572 struct cftype *cft,
5345 struct seq_file *seq) 5573 struct seq_file *seq)
5346{ 5574{
5347 struct cg_cgroup_link *link; 5575 struct cgrp_cset_link *link;
5348 struct css_set *cg; 5576 struct css_set *cset;
5349 5577
5350 read_lock(&css_set_lock); 5578 read_lock(&css_set_lock);
5351 rcu_read_lock(); 5579 rcu_read_lock();
5352 cg = rcu_dereference(current->cgroups); 5580 cset = rcu_dereference(current->cgroups);
5353 list_for_each_entry(link, &cg->cg_links, cg_link_list) { 5581 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5354 struct cgroup *c = link->cgrp; 5582 struct cgroup *c = link->cgrp;
5355 const char *name; 5583 const char *name;
5356 5584
@@ -5367,19 +5595,19 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
5367} 5595}
5368 5596
5369#define MAX_TASKS_SHOWN_PER_CSS 25 5597#define MAX_TASKS_SHOWN_PER_CSS 25
5370static int cgroup_css_links_read(struct cgroup *cont, 5598static int cgroup_css_links_read(struct cgroup *cgrp,
5371 struct cftype *cft, 5599 struct cftype *cft,
5372 struct seq_file *seq) 5600 struct seq_file *seq)
5373{ 5601{
5374 struct cg_cgroup_link *link; 5602 struct cgrp_cset_link *link;
5375 5603
5376 read_lock(&css_set_lock); 5604 read_lock(&css_set_lock);
5377 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { 5605 list_for_each_entry(link, &cgrp->cset_links, cset_link) {
5378 struct css_set *cg = link->cg; 5606 struct css_set *cset = link->cset;
5379 struct task_struct *task; 5607 struct task_struct *task;
5380 int count = 0; 5608 int count = 0;
5381 seq_printf(seq, "css_set %p\n", cg); 5609 seq_printf(seq, "css_set %p\n", cset);
5382 list_for_each_entry(task, &cg->tasks, cg_list) { 5610 list_for_each_entry(task, &cset->tasks, cg_list) {
5383 if (count++ > MAX_TASKS_SHOWN_PER_CSS) { 5611 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
5384 seq_puts(seq, " ...\n"); 5612 seq_puts(seq, " ...\n");
5385 break; 5613 break;
@@ -5400,10 +5628,6 @@ static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
5400 5628
5401static struct cftype debug_files[] = { 5629static struct cftype debug_files[] = {
5402 { 5630 {
5403 .name = "cgroup_refcount",
5404 .read_u64 = cgroup_refcount_read,
5405 },
5406 {
5407 .name = "taskcount", 5631 .name = "taskcount",
5408 .read_u64 = debug_taskcount_read, 5632 .read_u64 = debug_taskcount_read,
5409 }, 5633 },
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 198a38883e64..b2b227b82123 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -366,7 +366,7 @@ EXPORT_SYMBOL(cpu_down);
366#endif /*CONFIG_HOTPLUG_CPU*/ 366#endif /*CONFIG_HOTPLUG_CPU*/
367 367
368/* Requires cpu_add_remove_lock to be held */ 368/* Requires cpu_add_remove_lock to be held */
369static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) 369static int _cpu_up(unsigned int cpu, int tasks_frozen)
370{ 370{
371 int ret, nr_calls = 0; 371 int ret, nr_calls = 0;
372 void *hcpu = (void *)(long)cpu; 372 void *hcpu = (void *)(long)cpu;
@@ -419,7 +419,7 @@ out:
419 return ret; 419 return ret;
420} 420}
421 421
422int __cpuinit cpu_up(unsigned int cpu) 422int cpu_up(unsigned int cpu)
423{ 423{
424 int err = 0; 424 int err = 0;
425 425
@@ -618,7 +618,7 @@ core_initcall(cpu_hotplug_pm_sync_init);
618 * It must be called by the arch code on the new cpu, before the new cpu 618 * It must be called by the arch code on the new cpu, before the new cpu
619 * enables interrupts and before the "boot" cpu returns from __cpu_up(). 619 * enables interrupts and before the "boot" cpu returns from __cpu_up().
620 */ 620 */
621void __cpuinit notify_cpu_starting(unsigned int cpu) 621void notify_cpu_starting(unsigned int cpu)
622{ 622{
623 unsigned long val = CPU_STARTING; 623 unsigned long val = CPU_STARTING;
624 624
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64b3f791bbe5..e5657788fedd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -59,6 +59,7 @@
59#include <linux/mutex.h> 59#include <linux/mutex.h>
60#include <linux/workqueue.h> 60#include <linux/workqueue.h>
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62#include <linux/wait.h>
62 63
63/* 64/*
64 * Tracks how many cpusets are currently defined in system. 65 * Tracks how many cpusets are currently defined in system.
@@ -87,6 +88,18 @@ struct cpuset {
87 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 88 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 89 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
89 90
91 /*
92 * These are the old Memory Nodes that tasks took on.
93 *
94 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
95 * - A new cpuset's old_mems_allowed is initialized when some
96 * task is moved into it.
97 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
98 * cpuset.mems_allowed and have tasks' nodemask updated, and
99 * then old_mems_allowed is updated to mems_allowed.
100 */
101 nodemask_t old_mems_allowed;
102
90 struct fmeter fmeter; /* memory_pressure filter */ 103 struct fmeter fmeter; /* memory_pressure filter */
91 104
92 /* 105 /*
@@ -100,14 +113,12 @@ struct cpuset {
100 113
101 /* for custom sched domain */ 114 /* for custom sched domain */
102 int relax_domain_level; 115 int relax_domain_level;
103
104 struct work_struct hotplug_work;
105}; 116};
106 117
107/* Retrieve the cpuset for a cgroup */ 118/* Retrieve the cpuset for a cgroup */
108static inline struct cpuset *cgroup_cs(struct cgroup *cont) 119static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
109{ 120{
110 return container_of(cgroup_subsys_state(cont, cpuset_subsys_id), 121 return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id),
111 struct cpuset, css); 122 struct cpuset, css);
112} 123}
113 124
@@ -267,14 +278,11 @@ static DEFINE_MUTEX(callback_mutex);
267/* 278/*
268 * CPU / memory hotplug is handled asynchronously. 279 * CPU / memory hotplug is handled asynchronously.
269 */ 280 */
270static struct workqueue_struct *cpuset_propagate_hotplug_wq;
271
272static void cpuset_hotplug_workfn(struct work_struct *work); 281static void cpuset_hotplug_workfn(struct work_struct *work);
273static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
274static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
275
276static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); 282static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
277 283
284static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
285
278/* 286/*
279 * This is ugly, but preserves the userspace API for existing cpuset 287 * This is ugly, but preserves the userspace API for existing cpuset
280 * users. If someone tries to mount the "cpuset" filesystem, we 288 * users. If someone tries to mount the "cpuset" filesystem, we
@@ -304,53 +312,38 @@ static struct file_system_type cpuset_fs_type = {
304/* 312/*
305 * Return in pmask the portion of a cpusets's cpus_allowed that 313 * Return in pmask the portion of a cpusets's cpus_allowed that
306 * are online. If none are online, walk up the cpuset hierarchy 314 * are online. If none are online, walk up the cpuset hierarchy
307 * until we find one that does have some online cpus. If we get 315 * until we find one that does have some online cpus. The top
308 * all the way to the top and still haven't found any online cpus, 316 * cpuset always has some cpus online.
309 * return cpu_online_mask. Or if passed a NULL cs from an exit'ing
310 * task, return cpu_online_mask.
311 * 317 *
312 * One way or another, we guarantee to return some non-empty subset 318 * One way or another, we guarantee to return some non-empty subset
313 * of cpu_online_mask. 319 * of cpu_online_mask.
314 * 320 *
315 * Call with callback_mutex held. 321 * Call with callback_mutex held.
316 */ 322 */
317
318static void guarantee_online_cpus(const struct cpuset *cs, 323static void guarantee_online_cpus(const struct cpuset *cs,
319 struct cpumask *pmask) 324 struct cpumask *pmask)
320{ 325{
321 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 326 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
322 cs = parent_cs(cs); 327 cs = parent_cs(cs);
323 if (cs) 328 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
324 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
325 else
326 cpumask_copy(pmask, cpu_online_mask);
327 BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
328} 329}
329 330
330/* 331/*
331 * Return in *pmask the portion of a cpusets's mems_allowed that 332 * Return in *pmask the portion of a cpusets's mems_allowed that
332 * are online, with memory. If none are online with memory, walk 333 * are online, with memory. If none are online with memory, walk
333 * up the cpuset hierarchy until we find one that does have some 334 * up the cpuset hierarchy until we find one that does have some
334 * online mems. If we get all the way to the top and still haven't 335 * online mems. The top cpuset always has some mems online.
335 * found any online mems, return node_states[N_MEMORY].
336 * 336 *
337 * One way or another, we guarantee to return some non-empty subset 337 * One way or another, we guarantee to return some non-empty subset
338 * of node_states[N_MEMORY]. 338 * of node_states[N_MEMORY].
339 * 339 *
340 * Call with callback_mutex held. 340 * Call with callback_mutex held.
341 */ 341 */
342
343static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 342static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
344{ 343{
345 while (cs && !nodes_intersects(cs->mems_allowed, 344 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
346 node_states[N_MEMORY]))
347 cs = parent_cs(cs); 345 cs = parent_cs(cs);
348 if (cs) 346 nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
349 nodes_and(*pmask, cs->mems_allowed,
350 node_states[N_MEMORY]);
351 else
352 *pmask = node_states[N_MEMORY];
353 BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
354} 347}
355 348
356/* 349/*
@@ -440,7 +433,7 @@ static void free_trial_cpuset(struct cpuset *trial)
440 433
441static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 434static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
442{ 435{
443 struct cgroup *cont; 436 struct cgroup *cgrp;
444 struct cpuset *c, *par; 437 struct cpuset *c, *par;
445 int ret; 438 int ret;
446 439
@@ -448,7 +441,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
448 441
449 /* Each of our child cpusets must be a subset of us */ 442 /* Each of our child cpusets must be a subset of us */
450 ret = -EBUSY; 443 ret = -EBUSY;
451 cpuset_for_each_child(c, cont, cur) 444 cpuset_for_each_child(c, cgrp, cur)
452 if (!is_cpuset_subset(c, trial)) 445 if (!is_cpuset_subset(c, trial))
453 goto out; 446 goto out;
454 447
@@ -469,7 +462,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
469 * overlap 462 * overlap
470 */ 463 */
471 ret = -EINVAL; 464 ret = -EINVAL;
472 cpuset_for_each_child(c, cont, par) { 465 cpuset_for_each_child(c, cgrp, par) {
473 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 466 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
474 c != cur && 467 c != cur &&
475 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 468 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -486,7 +479,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
486 */ 479 */
487 ret = -ENOSPC; 480 ret = -ENOSPC;
488 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && 481 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
489 (cpumask_empty(trial->cpus_allowed) || 482 (cpumask_empty(trial->cpus_allowed) &&
490 nodes_empty(trial->mems_allowed))) 483 nodes_empty(trial->mems_allowed)))
491 goto out; 484 goto out;
492 485
@@ -540,7 +533,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
540 * This function builds a partial partition of the systems CPUs 533 * This function builds a partial partition of the systems CPUs
541 * A 'partial partition' is a set of non-overlapping subsets whose 534 * A 'partial partition' is a set of non-overlapping subsets whose
542 * union is a subset of that set. 535 * union is a subset of that set.
543 * The output of this function needs to be passed to kernel/sched.c 536 * The output of this function needs to be passed to kernel/sched/core.c
544 * partition_sched_domains() routine, which will rebuild the scheduler's 537 * partition_sched_domains() routine, which will rebuild the scheduler's
545 * load balancing domains (sched domains) as specified by that partial 538 * load balancing domains (sched domains) as specified by that partial
546 * partition. 539 * partition.
@@ -569,7 +562,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
569 * is a subset of one of these domains, while there are as 562 * is a subset of one of these domains, while there are as
570 * many such domains as possible, each as small as possible. 563 * many such domains as possible, each as small as possible.
571 * doms - Conversion of 'csa' to an array of cpumasks, for passing to 564 * doms - Conversion of 'csa' to an array of cpumasks, for passing to
572 * the kernel/sched.c routine partition_sched_domains() in a 565 * the kernel/sched/core.c routine partition_sched_domains() in a
573 * convenient format, that can be easily compared to the prior 566 * convenient format, that can be easily compared to the prior
574 * value to determine what partition elements (sched domains) 567 * value to determine what partition elements (sched domains)
575 * were changed (added or removed.) 568 * were changed (added or removed.)
@@ -798,21 +791,43 @@ void rebuild_sched_domains(void)
798 mutex_unlock(&cpuset_mutex); 791 mutex_unlock(&cpuset_mutex);
799} 792}
800 793
801/** 794/*
802 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's 795 * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
803 * @tsk: task to test 796 * @cs: the cpuset of interest
804 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
805 * 797 *
806 * Call with cpuset_mutex held. May take callback_mutex during call. 798 * A cpuset's effective cpumask is the cpumask of the nearest ancestor
807 * Called for each task in a cgroup by cgroup_scan_tasks(). 799 * with non-empty cpus. We use effective cpumask whenever:
808 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other 800 * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
809 * words, if its mask is not equal to its cpuset's mask). 801 * if the cpuset they reside in has no cpus)
802 * - we want to retrieve task_cs(tsk)'s cpus_allowed.
803 *
804 * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
805 * exception. See comments there.
810 */ 806 */
811static int cpuset_test_cpumask(struct task_struct *tsk, 807static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
812 struct cgroup_scanner *scan)
813{ 808{
814 return !cpumask_equal(&tsk->cpus_allowed, 809 while (cpumask_empty(cs->cpus_allowed))
815 (cgroup_cs(scan->cg))->cpus_allowed); 810 cs = parent_cs(cs);
811 return cs;
812}
813
814/*
815 * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
816 * @cs: the cpuset of interest
817 *
818 * A cpuset's effective nodemask is the nodemask of the nearest ancestor
819 * with non-empty mems. We use effective nodemask whenever:
820 * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
821 * if the cpuset they reside in has no mems)
822 * - we want to retrieve task_cs(tsk)'s mems_allowed.
823 *
824 * Called with cpuset_mutex held.
825 */
826static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
827{
828 while (nodes_empty(cs->mems_allowed))
829 cs = parent_cs(cs);
830 return cs;
816} 831}
817 832
818/** 833/**
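effective_cpumask_cpuset() and effective_nodemask_cpuset() above simply climb the parent chain until they reach a cpuset whose mask is non-empty; the loop terminates because the top cpuset is never empty. The same fallback walk over a toy tree (struct and field names are made up for the illustration, not kernel types):

/*
 * Walk-up-to-a-usable-ancestor sketch: if a node's own mask is empty,
 * fall back to the nearest ancestor that still has something in it.
 */
#include <stdio.h>

struct node {
	const char   *name;
	unsigned long mask;	/* stand-in for cpus_allowed / mems_allowed */
	struct node  *parent;	/* NULL only at the root */
};

/* mirrors effective_*_cpuset(): the root is assumed non-empty,
 * which is what guarantees the loop stops */
static struct node *effective_node(struct node *n)
{
	while (n->mask == 0)
		n = n->parent;
	return n;
}

int main(void)
{
	struct node root = { "root", 0xf, NULL  };
	struct node mid  = { "mid",  0x0, &root };	/* emptied, e.g. by hotplug */
	struct node leaf = { "leaf", 0x0, &mid  };	/* also empty */

	printf("leaf falls back to: %s (mask 0x%lx)\n",
	       effective_node(&leaf)->name, effective_node(&leaf)->mask);
	return 0;
}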
@@ -829,7 +844,10 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
829static void cpuset_change_cpumask(struct task_struct *tsk, 844static void cpuset_change_cpumask(struct task_struct *tsk,
830 struct cgroup_scanner *scan) 845 struct cgroup_scanner *scan)
831{ 846{
832 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); 847 struct cpuset *cpus_cs;
848
849 cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
850 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
833} 851}
834 852
835/** 853/**
@@ -850,12 +868,51 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
850 struct cgroup_scanner scan; 868 struct cgroup_scanner scan;
851 869
852 scan.cg = cs->css.cgroup; 870 scan.cg = cs->css.cgroup;
853 scan.test_task = cpuset_test_cpumask; 871 scan.test_task = NULL;
854 scan.process_task = cpuset_change_cpumask; 872 scan.process_task = cpuset_change_cpumask;
855 scan.heap = heap; 873 scan.heap = heap;
856 cgroup_scan_tasks(&scan); 874 cgroup_scan_tasks(&scan);
857} 875}
858 876
877/*
878 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
879 * @root_cs: the root cpuset of the hierarchy
880 * @update_root: update root cpuset or not?
881 * @heap: the heap used by cgroup_scan_tasks()
882 *
883 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
884 * which take on cpumask of @root_cs.
885 *
886 * Called with cpuset_mutex held
887 */
888static void update_tasks_cpumask_hier(struct cpuset *root_cs,
889 bool update_root, struct ptr_heap *heap)
890{
891 struct cpuset *cp;
892 struct cgroup *pos_cgrp;
893
894 if (update_root)
895 update_tasks_cpumask(root_cs, heap);
896
897 rcu_read_lock();
898 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
899 /* skip the whole subtree if @cp has some CPUs */
900 if (!cpumask_empty(cp->cpus_allowed)) {
901 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
902 continue;
903 }
904 if (!css_tryget(&cp->css))
905 continue;
906 rcu_read_unlock();
907
908 update_tasks_cpumask(cp, heap);
909
910 rcu_read_lock();
911 css_put(&cp->css);
912 }
913 rcu_read_unlock();
914}
915
859/** 916/**
860 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 917 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
861 * @cs: the cpuset to consider 918 * @cs: the cpuset to consider
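update_tasks_cpumask_hier() above visits the descendants in pre-order and prunes any subtree whose root has CPUs of its own, since those cpusets (and everything under them) are not inheriting this cpuset's mask; the RCU and css_tryget() juggling exists only so the update can sleep mid-walk. The pruning itself, reduced to a recursive sketch over a toy tree (all names invented):

/*
 * Pruned pre-order walk sketch: update every descendant whose own mask
 * is empty; a descendant with a mask of its own "shadows" the root, so
 * its whole subtree is skipped.  Toy types only.
 */
#include <stdio.h>

#define MAX_CHILDREN 4

struct cs {
	const char   *name;
	unsigned long mask;
	struct cs    *child[MAX_CHILDREN];
	int           nr_children;
};

static void update_tasks(struct cs *c, unsigned long effective)
{
	printf("update %-4s -> effective mask 0x%lx\n", c->name, effective);
}

static void update_hier(struct cs *root, unsigned long effective)
{
	for (int i = 0; i < root->nr_children; i++) {
		struct cs *cp = root->child[i];

		/* skip the whole subtree if @cp has a mask of its own */
		if (cp->mask != 0)
			continue;

		update_tasks(cp, effective);
		update_hier(cp, effective);
	}
}

int main(void)
{
	struct cs d = { "d", 0x0 };
	struct cs c = { "c", 0x2, { &d }, 1 };	/* has CPUs: subtree pruned */
	struct cs b = { "b", 0x0 };		/* empty: inherits from a */
	struct cs a = { "a", 0xc, { &b, &c }, 2 };

	update_tasks(&a, a.mask);		/* the update_root == true case */
	update_hier(&a, a.mask);
	return 0;
}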
@@ -888,14 +945,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
888 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) 945 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
889 return -EINVAL; 946 return -EINVAL;
890 } 947 }
891 retval = validate_change(cs, trialcs);
892 if (retval < 0)
893 return retval;
894 948
895 /* Nothing to do if the cpus didn't change */ 949 /* Nothing to do if the cpus didn't change */
896 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) 950 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
897 return 0; 951 return 0;
898 952
953 retval = validate_change(cs, trialcs);
954 if (retval < 0)
955 return retval;
956
899 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); 957 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
900 if (retval) 958 if (retval)
901 return retval; 959 return retval;
@@ -906,11 +964,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
906 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 964 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
907 mutex_unlock(&callback_mutex); 965 mutex_unlock(&callback_mutex);
908 966
909 /* 967 update_tasks_cpumask_hier(cs, true, &heap);
910 * Scan tasks in the cpuset, and update the cpumasks of any
911 * that need an update.
912 */
913 update_tasks_cpumask(cs, &heap);
914 968
915 heap_free(&heap); 969 heap_free(&heap);
916 970
@@ -943,12 +997,14 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
943 const nodemask_t *to) 997 const nodemask_t *to)
944{ 998{
945 struct task_struct *tsk = current; 999 struct task_struct *tsk = current;
1000 struct cpuset *mems_cs;
946 1001
947 tsk->mems_allowed = *to; 1002 tsk->mems_allowed = *to;
948 1003
949 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 1004 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
950 1005
951 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); 1006 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
1007 guarantee_online_mems(mems_cs, &tsk->mems_allowed);
952} 1008}
953 1009
954/* 1010/*
@@ -1007,16 +1063,12 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1007static void cpuset_change_nodemask(struct task_struct *p, 1063static void cpuset_change_nodemask(struct task_struct *p,
1008 struct cgroup_scanner *scan) 1064 struct cgroup_scanner *scan)
1009{ 1065{
1066 struct cpuset *cs = cgroup_cs(scan->cg);
1010 struct mm_struct *mm; 1067 struct mm_struct *mm;
1011 struct cpuset *cs;
1012 int migrate; 1068 int migrate;
1013 const nodemask_t *oldmem = scan->data; 1069 nodemask_t *newmems = scan->data;
1014 static nodemask_t newmems; /* protected by cpuset_mutex */
1015
1016 cs = cgroup_cs(scan->cg);
1017 guarantee_online_mems(cs, &newmems);
1018 1070
1019 cpuset_change_task_nodemask(p, &newmems); 1071 cpuset_change_task_nodemask(p, newmems);
1020 1072
1021 mm = get_task_mm(p); 1073 mm = get_task_mm(p);
1022 if (!mm) 1074 if (!mm)
@@ -1026,7 +1078,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1026 1078
1027 mpol_rebind_mm(mm, &cs->mems_allowed); 1079 mpol_rebind_mm(mm, &cs->mems_allowed);
1028 if (migrate) 1080 if (migrate)
1029 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); 1081 cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems);
1030 mmput(mm); 1082 mmput(mm);
1031} 1083}
1032 1084
@@ -1035,25 +1087,27 @@ static void *cpuset_being_rebound;
1035/** 1087/**
1036 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1088 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1037 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1089 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1038 * @oldmem: old mems_allowed of cpuset cs
1039 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1090 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1040 * 1091 *
1041 * Called with cpuset_mutex held 1092 * Called with cpuset_mutex held
1042 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1093 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1043 * if @heap != NULL. 1094 * if @heap != NULL.
1044 */ 1095 */
1045static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, 1096static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1046 struct ptr_heap *heap)
1047{ 1097{
1098 static nodemask_t newmems; /* protected by cpuset_mutex */
1048 struct cgroup_scanner scan; 1099 struct cgroup_scanner scan;
1100 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1049 1101
1050 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1102 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1051 1103
1104 guarantee_online_mems(mems_cs, &newmems);
1105
1052 scan.cg = cs->css.cgroup; 1106 scan.cg = cs->css.cgroup;
1053 scan.test_task = NULL; 1107 scan.test_task = NULL;
1054 scan.process_task = cpuset_change_nodemask; 1108 scan.process_task = cpuset_change_nodemask;
1055 scan.heap = heap; 1109 scan.heap = heap;
1056 scan.data = (nodemask_t *)oldmem; 1110 scan.data = &newmems;
1057 1111
1058 /* 1112 /*
1059 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1113 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1067,11 +1121,56 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1067 */ 1121 */
1068 cgroup_scan_tasks(&scan); 1122 cgroup_scan_tasks(&scan);
1069 1123
1124 /*
1125 * All the tasks' nodemasks have been updated, update
1126 * cs->old_mems_allowed.
1127 */
1128 cs->old_mems_allowed = newmems;
1129
1070 /* We're done rebinding vmas to this cpuset's new mems_allowed. */ 1130 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1071 cpuset_being_rebound = NULL; 1131 cpuset_being_rebound = NULL;
1072} 1132}
1073 1133
1074/* 1134/*
1135 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1136 * @root_cs: the root cpuset of the hierarchy
1137 * @update_root: update the root cpuset or not?
1138 * @heap: the heap used by cgroup_scan_tasks()
1139 *
1140 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1141 * which take on nodemask of @root_cs.
1142 *
1143 * Called with cpuset_mutex held
1144 */
1145static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1146 bool update_root, struct ptr_heap *heap)
1147{
1148 struct cpuset *cp;
1149 struct cgroup *pos_cgrp;
1150
1151 if (update_root)
1152 update_tasks_nodemask(root_cs, heap);
1153
1154 rcu_read_lock();
1155 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
1156 /* skip the whole subtree if @cp has some mems */
1157 if (!nodes_empty(cp->mems_allowed)) {
1158 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
1159 continue;
1160 }
1161 if (!css_tryget(&cp->css))
1162 continue;
1163 rcu_read_unlock();
1164
1165 update_tasks_nodemask(cp, heap);
1166
1167 rcu_read_lock();
1168 css_put(&cp->css);
1169 }
1170 rcu_read_unlock();
1171}
1172
1173/*
1075 * Handle user request to change the 'mems' memory placement 1174 * Handle user request to change the 'mems' memory placement
1076 * of a cpuset. Needs to validate the request, update the 1175 * of a cpuset. Needs to validate the request, update the
1077 * cpusets mems_allowed, and for each task in the cpuset, 1176 * cpusets mems_allowed, and for each task in the cpuset,
@@ -1087,13 +1186,9 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1087static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 1186static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1088 const char *buf) 1187 const char *buf)
1089{ 1188{
1090 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1091 int retval; 1189 int retval;
1092 struct ptr_heap heap; 1190 struct ptr_heap heap;
1093 1191
1094 if (!oldmem)
1095 return -ENOMEM;
1096
1097 /* 1192 /*
1098 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 1193 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
1099 * it's read-only 1194 * it's read-only
@@ -1122,8 +1217,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1122 goto done; 1217 goto done;
1123 } 1218 }
1124 } 1219 }
1125 *oldmem = cs->mems_allowed; 1220
1126 if (nodes_equal(*oldmem, trialcs->mems_allowed)) { 1221 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1127 retval = 0; /* Too easy - nothing to do */ 1222 retval = 0; /* Too easy - nothing to do */
1128 goto done; 1223 goto done;
1129 } 1224 }
@@ -1139,11 +1234,10 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1139 cs->mems_allowed = trialcs->mems_allowed; 1234 cs->mems_allowed = trialcs->mems_allowed;
1140 mutex_unlock(&callback_mutex); 1235 mutex_unlock(&callback_mutex);
1141 1236
1142 update_tasks_nodemask(cs, oldmem, &heap); 1237 update_tasks_nodemask_hier(cs, true, &heap);
1143 1238
1144 heap_free(&heap); 1239 heap_free(&heap);
1145done: 1240done:
1146 NODEMASK_FREE(oldmem);
1147 return retval; 1241 return retval;
1148} 1242}
1149 1243
@@ -1372,8 +1466,13 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1372 1466
1373 mutex_lock(&cpuset_mutex); 1467 mutex_lock(&cpuset_mutex);
1374 1468
1469 /*
1470 * We allow moving tasks into an empty cpuset if the sane_behavior
1471 * flag is set.
1472 */
1375 ret = -ENOSPC; 1473 ret = -ENOSPC;
1376 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1474 if (!cgroup_sane_behavior(cgrp) &&
1475 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1377 goto out_unlock; 1476 goto out_unlock;
1378 1477
1379 cgroup_taskset_for_each(task, cgrp, tset) { 1478 cgroup_taskset_for_each(task, cgrp, tset) {
@@ -1422,8 +1521,7 @@ static cpumask_var_t cpus_attach;
1422 1521
1423static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1522static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1424{ 1523{
1425 /* static bufs protected by cpuset_mutex */ 1524 /* static buf protected by cpuset_mutex */
1426 static nodemask_t cpuset_attach_nodemask_from;
1427 static nodemask_t cpuset_attach_nodemask_to; 1525 static nodemask_t cpuset_attach_nodemask_to;
1428 struct mm_struct *mm; 1526 struct mm_struct *mm;
1429 struct task_struct *task; 1527 struct task_struct *task;
@@ -1431,6 +1529,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1431 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); 1529 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
1432 struct cpuset *cs = cgroup_cs(cgrp); 1530 struct cpuset *cs = cgroup_cs(cgrp);
1433 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1531 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1532 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1533 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1434 1534
1435 mutex_lock(&cpuset_mutex); 1535 mutex_lock(&cpuset_mutex);
1436 1536
@@ -1438,9 +1538,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1438 if (cs == &top_cpuset) 1538 if (cs == &top_cpuset)
1439 cpumask_copy(cpus_attach, cpu_possible_mask); 1539 cpumask_copy(cpus_attach, cpu_possible_mask);
1440 else 1540 else
1441 guarantee_online_cpus(cs, cpus_attach); 1541 guarantee_online_cpus(cpus_cs, cpus_attach);
1442 1542
1443 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1543 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1444 1544
1445 cgroup_taskset_for_each(task, cgrp, tset) { 1545 cgroup_taskset_for_each(task, cgrp, tset) {
1446 /* 1546 /*
@@ -1457,26 +1557,32 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1457 * Change mm, possibly for multiple threads in a threadgroup. This is 1557 * Change mm, possibly for multiple threads in a threadgroup. This is
1458 * expensive and may sleep. 1558 * expensive and may sleep.
1459 */ 1559 */
1460 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1461 cpuset_attach_nodemask_to = cs->mems_allowed; 1560 cpuset_attach_nodemask_to = cs->mems_allowed;
1462 mm = get_task_mm(leader); 1561 mm = get_task_mm(leader);
1463 if (mm) { 1562 if (mm) {
1563 struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
1564
1464 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1565 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1465 if (is_memory_migrate(cs)) 1566
1466 cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, 1567 /*
 1568 * old_mems_allowed is the same as mems_allowed here, except
1569 * if this task is being moved automatically due to hotplug.
1570 * In that case @mems_allowed has been updated and is empty,
 1571 * so @old_mems_allowed is the right nodemask that we migrate
1572 * mm from.
1573 */
1574 if (is_memory_migrate(cs)) {
1575 cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
1467 &cpuset_attach_nodemask_to); 1576 &cpuset_attach_nodemask_to);
1577 }
1468 mmput(mm); 1578 mmput(mm);
1469 } 1579 }
1470 1580
1471 cs->attach_in_progress--; 1581 cs->old_mems_allowed = cpuset_attach_nodemask_to;
1472 1582
1473 /* 1583 cs->attach_in_progress--;
1474 * We may have raced with CPU/memory hotunplug. Trigger hotplug 1584 if (!cs->attach_in_progress)
1475 * propagation if @cs doesn't have any CPU or memory. It will move 1585 wake_up(&cpuset_attach_wq);
1476 * the newly added tasks to the nearest parent which can execute.
1477 */
1478 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1479 schedule_cpuset_propagate_hotplug(cs);
1480 1586
1481 mutex_unlock(&cpuset_mutex); 1587 mutex_unlock(&cpuset_mutex);
1482} 1588}
@@ -1588,13 +1694,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1588 * resources, wait for the previously scheduled operations before 1694 * resources, wait for the previously scheduled operations before
1589 * proceeding, so that we don't end up keep removing tasks added 1695 * proceeding, so that we don't end up keep removing tasks added
1590 * after execution capability is restored. 1696 * after execution capability is restored.
1591 *
1592 * Flushing cpuset_hotplug_work is enough to synchronize against
 1593 * hotplug handling; however, cpuset_attach() may schedule
1594 * propagation work directly. Flush the workqueue too.
1595 */ 1697 */
1596 flush_work(&cpuset_hotplug_work); 1698 flush_work(&cpuset_hotplug_work);
1597 flush_workqueue(cpuset_propagate_hotplug_wq);
1598 1699
1599 mutex_lock(&cpuset_mutex); 1700 mutex_lock(&cpuset_mutex);
1600 if (!is_cpuset_online(cs)) 1701 if (!is_cpuset_online(cs))
@@ -1658,13 +1759,13 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1658 return count; 1759 return count;
1659} 1760}
1660 1761
1661static ssize_t cpuset_common_file_read(struct cgroup *cont, 1762static ssize_t cpuset_common_file_read(struct cgroup *cgrp,
1662 struct cftype *cft, 1763 struct cftype *cft,
1663 struct file *file, 1764 struct file *file,
1664 char __user *buf, 1765 char __user *buf,
1665 size_t nbytes, loff_t *ppos) 1766 size_t nbytes, loff_t *ppos)
1666{ 1767{
1667 struct cpuset *cs = cgroup_cs(cont); 1768 struct cpuset *cs = cgroup_cs(cgrp);
1668 cpuset_filetype_t type = cft->private; 1769 cpuset_filetype_t type = cft->private;
1669 char *page; 1770 char *page;
1670 ssize_t retval = 0; 1771 ssize_t retval = 0;
@@ -1694,9 +1795,9 @@ out:
1694 return retval; 1795 return retval;
1695} 1796}
1696 1797
1697static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) 1798static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)
1698{ 1799{
1699 struct cpuset *cs = cgroup_cs(cont); 1800 struct cpuset *cs = cgroup_cs(cgrp);
1700 cpuset_filetype_t type = cft->private; 1801 cpuset_filetype_t type = cft->private;
1701 switch (type) { 1802 switch (type) {
1702 case FILE_CPU_EXCLUSIVE: 1803 case FILE_CPU_EXCLUSIVE:
@@ -1725,9 +1826,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1725 return 0; 1826 return 0;
1726} 1827}
1727 1828
1728static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) 1829static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft)
1729{ 1830{
1730 struct cpuset *cs = cgroup_cs(cont); 1831 struct cpuset *cs = cgroup_cs(cgrp);
1731 cpuset_filetype_t type = cft->private; 1832 cpuset_filetype_t type = cft->private;
1732 switch (type) { 1833 switch (type) {
1733 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1834 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1839,14 +1940,14 @@ static struct cftype files[] = {
1839 1940
1840/* 1941/*
1841 * cpuset_css_alloc - allocate a cpuset css 1942 * cpuset_css_alloc - allocate a cpuset css
1842 * cont: control group that the new cpuset will be part of 1943 * cgrp: control group that the new cpuset will be part of
1843 */ 1944 */
1844 1945
1845static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) 1946static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)
1846{ 1947{
1847 struct cpuset *cs; 1948 struct cpuset *cs;
1848 1949
1849 if (!cont->parent) 1950 if (!cgrp->parent)
1850 return &top_cpuset.css; 1951 return &top_cpuset.css;
1851 1952
1852 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1953 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
@@ -1861,7 +1962,6 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1861 cpumask_clear(cs->cpus_allowed); 1962 cpumask_clear(cs->cpus_allowed);
1862 nodes_clear(cs->mems_allowed); 1963 nodes_clear(cs->mems_allowed);
1863 fmeter_init(&cs->fmeter); 1964 fmeter_init(&cs->fmeter);
1864 INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
1865 cs->relax_domain_level = -1; 1965 cs->relax_domain_level = -1;
1866 1966
1867 return &cs->css; 1967 return &cs->css;
@@ -1942,9 +2042,9 @@ static void cpuset_css_offline(struct cgroup *cgrp)
1942 * will call rebuild_sched_domains_locked(). 2042 * will call rebuild_sched_domains_locked().
1943 */ 2043 */
1944 2044
1945static void cpuset_css_free(struct cgroup *cont) 2045static void cpuset_css_free(struct cgroup *cgrp)
1946{ 2046{
1947 struct cpuset *cs = cgroup_cs(cont); 2047 struct cpuset *cs = cgroup_cs(cgrp);
1948 2048
1949 free_cpumask_var(cs->cpus_allowed); 2049 free_cpumask_var(cs->cpus_allowed);
1950 kfree(cs); 2050 kfree(cs);
@@ -2024,41 +2124,64 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2024} 2124}
2025 2125
2026/** 2126/**
2027 * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset 2127 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
2028 * @cs: cpuset in interest 2128 * @cs: cpuset in interest
2029 * 2129 *
2030 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone 2130 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
2031 * offline, update @cs accordingly. If @cs ends up with no CPU or memory, 2131 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
2032 * all its tasks are moved to the nearest ancestor with both resources. 2132 * all its tasks are moved to the nearest ancestor with both resources.
2033 */ 2133 */
2034static void cpuset_propagate_hotplug_workfn(struct work_struct *work) 2134static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2035{ 2135{
2036 static cpumask_t off_cpus; 2136 static cpumask_t off_cpus;
2037 static nodemask_t off_mems, tmp_mems; 2137 static nodemask_t off_mems;
2038 struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
2039 bool is_empty; 2138 bool is_empty;
2139 bool sane = cgroup_sane_behavior(cs->css.cgroup);
2140
2141retry:
2142 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2040 2143
2041 mutex_lock(&cpuset_mutex); 2144 mutex_lock(&cpuset_mutex);
2042 2145
2146 /*
2147 * We have raced with task attaching. We wait until attaching
2148 * is finished, so we won't attach a task to an empty cpuset.
2149 */
2150 if (cs->attach_in_progress) {
2151 mutex_unlock(&cpuset_mutex);
2152 goto retry;
2153 }
2154
2043 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); 2155 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
2044 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); 2156 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
2045 2157
2046 /* remove offline cpus from @cs */ 2158 mutex_lock(&callback_mutex);
2047 if (!cpumask_empty(&off_cpus)) { 2159 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2048 mutex_lock(&callback_mutex); 2160 mutex_unlock(&callback_mutex);
2049 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); 2161
2050 mutex_unlock(&callback_mutex); 2162 /*
2163 * If sane_behavior flag is set, we need to update tasks' cpumask
2164 * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
2165 * call update_tasks_cpumask() if the cpuset becomes empty, as
2166 * the tasks in it will be migrated to an ancestor.
2167 */
2168 if ((sane && cpumask_empty(cs->cpus_allowed)) ||
2169 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
2051 update_tasks_cpumask(cs, NULL); 2170 update_tasks_cpumask(cs, NULL);
2052 }
2053 2171
2054 /* remove offline mems from @cs */ 2172 mutex_lock(&callback_mutex);
2055 if (!nodes_empty(off_mems)) { 2173 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
2056 tmp_mems = cs->mems_allowed; 2174 mutex_unlock(&callback_mutex);
2057 mutex_lock(&callback_mutex); 2175
2058 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); 2176 /*
2059 mutex_unlock(&callback_mutex); 2177 * If sane_behavior flag is set, we need to update tasks' nodemask
2060 update_tasks_nodemask(cs, &tmp_mems, NULL); 2178 * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
2061 } 2179 * call update_tasks_nodemask() if the cpuset becomes empty, as
 2180 * the tasks in it will be migrated to an ancestor.
2181 */
2182 if ((sane && nodes_empty(cs->mems_allowed)) ||
2183 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
2184 update_tasks_nodemask(cs, NULL);
2062 2185
2063 is_empty = cpumask_empty(cs->cpus_allowed) || 2186 is_empty = cpumask_empty(cs->cpus_allowed) ||
2064 nodes_empty(cs->mems_allowed); 2187 nodes_empty(cs->mems_allowed);
@@ -2066,40 +2189,14 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
2066 mutex_unlock(&cpuset_mutex); 2189 mutex_unlock(&cpuset_mutex);
2067 2190
2068 /* 2191 /*
2069 * If @cs became empty, move tasks to the nearest ancestor with 2192 * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
2070 * execution resources. This is full cgroup operation which will 2193 *
2194 * Otherwise move tasks to the nearest ancestor with execution
2195 * resources. This is full cgroup operation which will
2071 * also call back into cpuset. Should be done outside any lock. 2196 * also call back into cpuset. Should be done outside any lock.
2072 */ 2197 */
2073 if (is_empty) 2198 if (!sane && is_empty)
2074 remove_tasks_in_empty_cpuset(cs); 2199 remove_tasks_in_empty_cpuset(cs);
2075
2076 /* the following may free @cs, should be the last operation */
2077 css_put(&cs->css);
2078}
2079
2080/**
2081 * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
2082 * @cs: cpuset of interest
2083 *
2084 * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
2085 * memory masks according to top_cpuset.
2086 */
2087static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2088{
2089 /*
2090 * Pin @cs. The refcnt will be released when the work item
2091 * finishes executing.
2092 */
2093 if (!css_tryget(&cs->css))
2094 return;
2095
2096 /*
2097 * Queue @cs->hotplug_work. If already pending, lose the css ref.
2098 * cpuset_propagate_hotplug_wq is ordered and propagation will
2099 * happen in the order this function is called.
2100 */
2101 if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
2102 css_put(&cs->css);
2103} 2200}
2104 2201
2105/** 2202/**
@@ -2112,18 +2209,17 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2112 * actively using CPU hotplug but making no active use of cpusets. 2209 * actively using CPU hotplug but making no active use of cpusets.
2113 * 2210 *
2114 * Non-root cpusets are only affected by offlining. If any CPUs or memory 2211 * Non-root cpusets are only affected by offlining. If any CPUs or memory
2115 * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all 2212 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
2116 * descendants. 2213 * all descendants.
2117 * 2214 *
2118 * Note that CPU offlining during suspend is ignored. We don't modify 2215 * Note that CPU offlining during suspend is ignored. We don't modify
2119 * cpusets across suspend/resume cycles at all. 2216 * cpusets across suspend/resume cycles at all.
2120 */ 2217 */
2121static void cpuset_hotplug_workfn(struct work_struct *work) 2218static void cpuset_hotplug_workfn(struct work_struct *work)
2122{ 2219{
2123 static cpumask_t new_cpus, tmp_cpus; 2220 static cpumask_t new_cpus;
2124 static nodemask_t new_mems, tmp_mems; 2221 static nodemask_t new_mems;
2125 bool cpus_updated, mems_updated; 2222 bool cpus_updated, mems_updated;
2126 bool cpus_offlined, mems_offlined;
2127 2223
2128 mutex_lock(&cpuset_mutex); 2224 mutex_lock(&cpuset_mutex);
2129 2225
@@ -2132,12 +2228,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2132 new_mems = node_states[N_MEMORY]; 2228 new_mems = node_states[N_MEMORY];
2133 2229
2134 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); 2230 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
2135 cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
2136 &new_cpus);
2137
2138 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); 2231 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
2139 nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
2140 mems_offlined = !nodes_empty(tmp_mems);
2141 2232
2142 /* synchronize cpus_allowed to cpu_active_mask */ 2233 /* synchronize cpus_allowed to cpu_active_mask */
2143 if (cpus_updated) { 2234 if (cpus_updated) {
@@ -2149,28 +2240,32 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2149 2240
2150 /* synchronize mems_allowed to N_MEMORY */ 2241 /* synchronize mems_allowed to N_MEMORY */
2151 if (mems_updated) { 2242 if (mems_updated) {
2152 tmp_mems = top_cpuset.mems_allowed;
2153 mutex_lock(&callback_mutex); 2243 mutex_lock(&callback_mutex);
2154 top_cpuset.mems_allowed = new_mems; 2244 top_cpuset.mems_allowed = new_mems;
2155 mutex_unlock(&callback_mutex); 2245 mutex_unlock(&callback_mutex);
2156 update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); 2246 update_tasks_nodemask(&top_cpuset, NULL);
2157 } 2247 }
2158 2248
2159 /* if cpus or mems went down, we need to propagate to descendants */ 2249 mutex_unlock(&cpuset_mutex);
2160 if (cpus_offlined || mems_offlined) { 2250
2251 /* if cpus or mems changed, we need to propagate to descendants */
2252 if (cpus_updated || mems_updated) {
2161 struct cpuset *cs; 2253 struct cpuset *cs;
2162 struct cgroup *pos_cgrp; 2254 struct cgroup *pos_cgrp;
2163 2255
2164 rcu_read_lock(); 2256 rcu_read_lock();
2165 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) 2257 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) {
2166 schedule_cpuset_propagate_hotplug(cs); 2258 if (!css_tryget(&cs->css))
2167 rcu_read_unlock(); 2259 continue;
2168 } 2260 rcu_read_unlock();
2169 2261
2170 mutex_unlock(&cpuset_mutex); 2262 cpuset_hotplug_update_tasks(cs);
2171 2263
2172 /* wait for propagations to finish */ 2264 rcu_read_lock();
2173 flush_workqueue(cpuset_propagate_hotplug_wq); 2265 css_put(&cs->css);
2266 }
2267 rcu_read_unlock();
2268 }
2174 2269
2175 /* rebuild sched domains if cpus_allowed has changed */ 2270 /* rebuild sched domains if cpus_allowed has changed */
2176 if (cpus_updated) 2271 if (cpus_updated)
@@ -2219,12 +2314,9 @@ void __init cpuset_init_smp(void)
2219{ 2314{
2220 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2315 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2221 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2316 top_cpuset.mems_allowed = node_states[N_MEMORY];
2317 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
2222 2318
2223 register_hotmemory_notifier(&cpuset_track_online_nodes_nb); 2319 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2224
2225 cpuset_propagate_hotplug_wq =
2226 alloc_ordered_workqueue("cpuset_hotplug", 0);
2227 BUG_ON(!cpuset_propagate_hotplug_wq);
2228} 2320}
2229 2321
2230/** 2322/**
@@ -2240,21 +2332,23 @@ void __init cpuset_init_smp(void)
2240 2332
2241void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2333void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2242{ 2334{
2335 struct cpuset *cpus_cs;
2336
2243 mutex_lock(&callback_mutex); 2337 mutex_lock(&callback_mutex);
2244 task_lock(tsk); 2338 task_lock(tsk);
2245 guarantee_online_cpus(task_cs(tsk), pmask); 2339 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
2340 guarantee_online_cpus(cpus_cs, pmask);
2246 task_unlock(tsk); 2341 task_unlock(tsk);
2247 mutex_unlock(&callback_mutex); 2342 mutex_unlock(&callback_mutex);
2248} 2343}
2249 2344
2250void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2345void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2251{ 2346{
2252 const struct cpuset *cs; 2347 const struct cpuset *cpus_cs;
2253 2348
2254 rcu_read_lock(); 2349 rcu_read_lock();
2255 cs = task_cs(tsk); 2350 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
2256 if (cs) 2351 do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
2257 do_set_cpus_allowed(tsk, cs->cpus_allowed);
2258 rcu_read_unlock(); 2352 rcu_read_unlock();
2259 2353
2260 /* 2354 /*
@@ -2293,11 +2387,13 @@ void cpuset_init_current_mems_allowed(void)
2293 2387
2294nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 2388nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2295{ 2389{
2390 struct cpuset *mems_cs;
2296 nodemask_t mask; 2391 nodemask_t mask;
2297 2392
2298 mutex_lock(&callback_mutex); 2393 mutex_lock(&callback_mutex);
2299 task_lock(tsk); 2394 task_lock(tsk);
2300 guarantee_online_mems(task_cs(tsk), &mask); 2395 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
2396 guarantee_online_mems(mems_cs, &mask);
2301 task_unlock(tsk); 2397 task_unlock(tsk);
2302 mutex_unlock(&callback_mutex); 2398 mutex_unlock(&callback_mutex);
2303 2399
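
[Editorial note] The cpuset hotplug path above trims cs->cpus_allowed and cs->mems_allowed by and-not'ing them against top_cpuset before deciding whether the cpuset has gone empty. The following is a userspace-flavoured sketch of that arithmetic using plain word-sized bitmasks rather than the kernel's cpumask/nodemask helpers; the helper name, mask width and the example CPU numbers are illustrative assumptions, not kernel code.

    #include <stdio.h>

    /* Toy stand-in for cpumask_andnot()/nodes_andnot(): dst = src & ~mask.
     * The real helpers operate on arbitrarily wide bitmaps; this only mirrors
     * the arithmetic the hotplug update performs under callback_mutex. */
    static unsigned long andnot(unsigned long src, unsigned long mask)
    {
            return src & ~mask;
    }

    int main(void)
    {
            unsigned long cs_cpus  = 0xF0; /* cpuset allows CPUs 4-7          */
            unsigned long top_cpus = 0x3F; /* CPUs 0-5 online: 6-7 went away  */

            /* off_cpus = cs->cpus_allowed & ~top_cpuset.cpus_allowed */
            unsigned long off_cpus = andnot(cs_cpus, top_cpus);
            /* cs->cpus_allowed &= ~off_cpus */
            cs_cpus = andnot(cs_cpus, off_cpus);

            printf("went offline: %#lx, still allowed: %#lx\n", off_cpus, cs_cpus);
            return 0;
    }

With sane_behavior the cpuset is allowed to end up with an empty mask (tasks fall back to an ancestor's effective mask); without it, an empty result triggers remove_tasks_in_empty_cpuset() as shown in the hunk above.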
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b391907d5352..f86599e8c123 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -165,10 +165,28 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
165/* 165/*
166 * max perf event sample rate 166 * max perf event sample rate
167 */ 167 */
168#define DEFAULT_MAX_SAMPLE_RATE 100000 168#define DEFAULT_MAX_SAMPLE_RATE 100000
169int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; 169#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
170static int max_samples_per_tick __read_mostly = 170#define DEFAULT_CPU_TIME_MAX_PERCENT 25
171 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); 171
172int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
173
174static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
175static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
176
177static atomic_t perf_sample_allowed_ns __read_mostly =
178 ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
179
180void update_perf_cpu_limits(void)
181{
182 u64 tmp = perf_sample_period_ns;
183
184 tmp *= sysctl_perf_cpu_time_max_percent;
185 do_div(tmp, 100);
186 atomic_set(&perf_sample_allowed_ns, tmp);
187}
188
189static int perf_rotate_context(struct perf_cpu_context *cpuctx);
172 190
173int perf_proc_update_handler(struct ctl_table *table, int write, 191int perf_proc_update_handler(struct ctl_table *table, int write,
174 void __user *buffer, size_t *lenp, 192 void __user *buffer, size_t *lenp,
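
[Editorial note] update_perf_cpu_limits() above derives a per-sample NMI time budget from the sample period and the cpu_time_max_percent knob; with the defaults introduced in this hunk (100000 samples/sec, 25%) that works out to a 10,000 ns period and a 2,500 ns budget. A minimal userspace check of that arithmetic follows; NSEC_PER_SEC is spelled out by hand since it is a kernel constant, and the macro names simply echo the defines above.

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC         1000000000ULL
    #define MAX_SAMPLE_RATE      100000   /* DEFAULT_MAX_SAMPLE_RATE      */
    #define CPU_TIME_MAX_PERCENT 25       /* DEFAULT_CPU_TIME_MAX_PERCENT */

    int main(void)
    {
            uint64_t period_ns  = NSEC_PER_SEC / MAX_SAMPLE_RATE;          /* 10000 ns */
            uint64_t allowed_ns = period_ns * CPU_TIME_MAX_PERCENT / 100;  /*  2500 ns */

            printf("sample period %llu ns, per-sample budget %llu ns\n",
                   (unsigned long long)period_ns, (unsigned long long)allowed_ns);
            return 0;
    }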
@@ -180,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
180 return ret; 198 return ret;
181 199
182 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); 200 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
201 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
202 update_perf_cpu_limits();
203
204 return 0;
205}
206
207int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
208
209int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
210 void __user *buffer, size_t *lenp,
211 loff_t *ppos)
212{
213 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
214
215 if (ret || !write)
216 return ret;
217
218 update_perf_cpu_limits();
183 219
184 return 0; 220 return 0;
185} 221}
186 222
223/*
224 * perf samples are done in some very critical code paths (NMIs).
225 * If they take too much CPU time, the system can lock up and not
226 * get any real work done. This will drop the sample rate when
227 * we detect that events are taking too long.
228 */
229#define NR_ACCUMULATED_SAMPLES 128
230DEFINE_PER_CPU(u64, running_sample_length);
231
232void perf_sample_event_took(u64 sample_len_ns)
233{
234 u64 avg_local_sample_len;
235 u64 local_samples_len;
236
237 if (atomic_read(&perf_sample_allowed_ns) == 0)
238 return;
239
240 /* decay the counter by 1 average sample */
241 local_samples_len = __get_cpu_var(running_sample_length);
242 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
243 local_samples_len += sample_len_ns;
244 __get_cpu_var(running_sample_length) = local_samples_len;
245
246 /*
 247 * note: this will be biased artificially low until we have
248 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
249 * from having to maintain a count.
250 */
251 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
252
253 if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
254 return;
255
256 if (max_samples_per_tick <= 1)
257 return;
258
259 max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
260 sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
261 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
262
263 printk_ratelimited(KERN_WARNING
264 "perf samples too long (%lld > %d), lowering "
265 "kernel.perf_event_max_sample_rate to %d\n",
266 avg_local_sample_len,
267 atomic_read(&perf_sample_allowed_ns),
268 sysctl_perf_event_sample_rate);
269
270 update_perf_cpu_limits();
271}
272
187static atomic64_t perf_event_id; 273static atomic64_t perf_event_id;
188 274
189static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 275static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
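
[Editorial note] perf_sample_event_took() above keeps a per-CPU running total that is decayed by one Nth on every update, which approximates an average over the last NR_ACCUMULATED_SAMPLES samples without keeping a separate count. The sketch below reproduces just that estimator in standalone C with a synthetic, fixed-length sample stream instead of real NMI timings.

    #include <stdio.h>
    #include <stdint.h>

    #define NR_ACCUMULATED_SAMPLES 128

    /* Same update rule as perf_sample_event_took(): decay the accumulator by
     * one average sample, add the new sample, read the average as total/N. */
    static uint64_t running;

    static uint64_t record_sample(uint64_t len_ns)
    {
            running -= running / NR_ACCUMULATED_SAMPLES;
            running += len_ns;
            return running / NR_ACCUMULATED_SAMPLES;  /* avg_local_sample_len */
    }

    int main(void)
    {
            uint64_t avg = 0;

            /* Feed 1000 samples of 3000 ns each; the estimate converges
             * towards 3000 ns from below (the "biased artificially low"
             * note in the hunk above). */
            for (int i = 0; i < 1000; i++)
                    avg = record_sample(3000);

            printf("estimated average sample length: %llu ns\n",
                   (unsigned long long)avg);
            return 0;
    }

Once the estimate exceeds perf_sample_allowed_ns, the handler above halves max_samples_per_tick and recomputes the sysctl-visible sample rate.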
@@ -655,6 +741,106 @@ perf_cgroup_mark_enabled(struct perf_event *event,
655} 741}
656#endif 742#endif
657 743
744/*
745 * set default to be dependent on timer tick just
746 * like original code
747 */
748#define PERF_CPU_HRTIMER (1000 / HZ)
749/*
 750 * function must be called with interrupts disabled
751 */
752static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
753{
754 struct perf_cpu_context *cpuctx;
755 enum hrtimer_restart ret = HRTIMER_NORESTART;
756 int rotations = 0;
757
758 WARN_ON(!irqs_disabled());
759
760 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
761
762 rotations = perf_rotate_context(cpuctx);
763
764 /*
765 * arm timer if needed
766 */
767 if (rotations) {
768 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
769 ret = HRTIMER_RESTART;
770 }
771
772 return ret;
773}
774
775/* CPU is going down */
776void perf_cpu_hrtimer_cancel(int cpu)
777{
778 struct perf_cpu_context *cpuctx;
779 struct pmu *pmu;
780 unsigned long flags;
781
782 if (WARN_ON(cpu != smp_processor_id()))
783 return;
784
785 local_irq_save(flags);
786
787 rcu_read_lock();
788
789 list_for_each_entry_rcu(pmu, &pmus, entry) {
790 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
791
792 if (pmu->task_ctx_nr == perf_sw_context)
793 continue;
794
795 hrtimer_cancel(&cpuctx->hrtimer);
796 }
797
798 rcu_read_unlock();
799
800 local_irq_restore(flags);
801}
802
803static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
804{
805 struct hrtimer *hr = &cpuctx->hrtimer;
806 struct pmu *pmu = cpuctx->ctx.pmu;
807 int timer;
808
809 /* no multiplexing needed for SW PMU */
810 if (pmu->task_ctx_nr == perf_sw_context)
811 return;
812
813 /*
814 * check default is sane, if not set then force to
815 * default interval (1/tick)
816 */
817 timer = pmu->hrtimer_interval_ms;
818 if (timer < 1)
819 timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
820
821 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
822
823 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
824 hr->function = perf_cpu_hrtimer_handler;
825}
826
827static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
828{
829 struct hrtimer *hr = &cpuctx->hrtimer;
830 struct pmu *pmu = cpuctx->ctx.pmu;
831
832 /* not for SW PMU */
833 if (pmu->task_ctx_nr == perf_sw_context)
834 return;
835
836 if (hrtimer_active(hr))
837 return;
838
839 if (!hrtimer_callback_running(hr))
840 __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
841 0, HRTIMER_MODE_REL_PINNED, 0);
842}
843
658void perf_pmu_disable(struct pmu *pmu) 844void perf_pmu_disable(struct pmu *pmu)
659{ 845{
660 int *count = this_cpu_ptr(pmu->pmu_disable_count); 846 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -761,8 +947,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
761{ 947{
762 struct perf_event_context *ctx; 948 struct perf_event_context *ctx;
763 949
764 rcu_read_lock();
765retry: 950retry:
951 /*
952 * One of the few rules of preemptible RCU is that one cannot do
953 * rcu_read_unlock() while holding a scheduler (or nested) lock when
954 * part of the read side critical section was preemptible -- see
955 * rcu_read_unlock_special().
956 *
957 * Since ctx->lock nests under rq->lock we must ensure the entire read
958 * side critical section is non-preemptible.
959 */
960 preempt_disable();
961 rcu_read_lock();
766 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); 962 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
767 if (ctx) { 963 if (ctx) {
768 /* 964 /*
@@ -778,6 +974,8 @@ retry:
778 raw_spin_lock_irqsave(&ctx->lock, *flags); 974 raw_spin_lock_irqsave(&ctx->lock, *flags);
779 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { 975 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
780 raw_spin_unlock_irqrestore(&ctx->lock, *flags); 976 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
977 rcu_read_unlock();
978 preempt_enable();
781 goto retry; 979 goto retry;
782 } 980 }
783 981
@@ -787,6 +985,7 @@ retry:
787 } 985 }
788 } 986 }
789 rcu_read_unlock(); 987 rcu_read_unlock();
988 preempt_enable();
790 return ctx; 989 return ctx;
791} 990}
792 991
@@ -1503,6 +1702,7 @@ group_sched_in(struct perf_event *group_event,
1503 1702
1504 if (event_sched_in(group_event, cpuctx, ctx)) { 1703 if (event_sched_in(group_event, cpuctx, ctx)) {
1505 pmu->cancel_txn(pmu); 1704 pmu->cancel_txn(pmu);
1705 perf_cpu_hrtimer_restart(cpuctx);
1506 return -EAGAIN; 1706 return -EAGAIN;
1507 } 1707 }
1508 1708
@@ -1549,6 +1749,8 @@ group_error:
1549 1749
1550 pmu->cancel_txn(pmu); 1750 pmu->cancel_txn(pmu);
1551 1751
1752 perf_cpu_hrtimer_restart(cpuctx);
1753
1552 return -EAGAIN; 1754 return -EAGAIN;
1553} 1755}
1554 1756
@@ -1761,7 +1963,16 @@ static int __perf_event_enable(void *info)
1761 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1963 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1762 int err; 1964 int err;
1763 1965
1764 if (WARN_ON_ONCE(!ctx->is_active)) 1966 /*
1967 * There's a time window between 'ctx->is_active' check
1968 * in perf_event_enable function and this place having:
1969 * - IRQs on
1970 * - ctx->lock unlocked
1971 *
1972 * where the task could be killed and 'ctx' deactivated
1973 * by perf_event_exit_task.
1974 */
1975 if (!ctx->is_active)
1765 return -EINVAL; 1976 return -EINVAL;
1766 1977
1767 raw_spin_lock(&ctx->lock); 1978 raw_spin_lock(&ctx->lock);
@@ -1804,8 +2015,10 @@ static int __perf_event_enable(void *info)
1804 * If this event can't go on and it's part of a 2015 * If this event can't go on and it's part of a
1805 * group, then the whole group has to come off. 2016 * group, then the whole group has to come off.
1806 */ 2017 */
1807 if (leader != event) 2018 if (leader != event) {
1808 group_sched_out(leader, cpuctx, ctx); 2019 group_sched_out(leader, cpuctx, ctx);
2020 perf_cpu_hrtimer_restart(cpuctx);
2021 }
1809 if (leader->attr.pinned) { 2022 if (leader->attr.pinned) {
1810 update_group_times(leader); 2023 update_group_times(leader);
1811 leader->state = PERF_EVENT_STATE_ERROR; 2024 leader->state = PERF_EVENT_STATE_ERROR;
@@ -2552,7 +2765,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
2552 * because they're strictly cpu affine and rotate_start is called with IRQs 2765 * because they're strictly cpu affine and rotate_start is called with IRQs
2553 * disabled, while rotate_context is called from IRQ context. 2766 * disabled, while rotate_context is called from IRQ context.
2554 */ 2767 */
2555static void perf_rotate_context(struct perf_cpu_context *cpuctx) 2768static int perf_rotate_context(struct perf_cpu_context *cpuctx)
2556{ 2769{
2557 struct perf_event_context *ctx = NULL; 2770 struct perf_event_context *ctx = NULL;
2558 int rotate = 0, remove = 1; 2771 int rotate = 0, remove = 1;
@@ -2591,6 +2804,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2591done: 2804done:
2592 if (remove) 2805 if (remove)
2593 list_del_init(&cpuctx->rotation_list); 2806 list_del_init(&cpuctx->rotation_list);
2807
2808 return rotate;
2594} 2809}
2595 2810
2596#ifdef CONFIG_NO_HZ_FULL 2811#ifdef CONFIG_NO_HZ_FULL
@@ -2622,10 +2837,6 @@ void perf_event_task_tick(void)
2622 ctx = cpuctx->task_ctx; 2837 ctx = cpuctx->task_ctx;
2623 if (ctx) 2838 if (ctx)
2624 perf_adjust_freq_unthr_context(ctx, throttled); 2839 perf_adjust_freq_unthr_context(ctx, throttled);
2625
2626 if (cpuctx->jiffies_interval == 1 ||
2627 !(jiffies % cpuctx->jiffies_interval))
2628 perf_rotate_context(cpuctx);
2629 } 2840 }
2630} 2841}
2631 2842
@@ -5036,7 +5247,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
5036 * sign as trigger. 5247 * sign as trigger.
5037 */ 5248 */
5038 5249
5039static u64 perf_swevent_set_period(struct perf_event *event) 5250u64 perf_swevent_set_period(struct perf_event *event)
5040{ 5251{
5041 struct hw_perf_event *hwc = &event->hw; 5252 struct hw_perf_event *hwc = &event->hw;
5042 u64 period = hwc->last_period; 5253 u64 period = hwc->last_period;
@@ -5979,9 +6190,54 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
5979 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); 6190 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5980} 6191}
5981 6192
6193static ssize_t
6194perf_event_mux_interval_ms_show(struct device *dev,
6195 struct device_attribute *attr,
6196 char *page)
6197{
6198 struct pmu *pmu = dev_get_drvdata(dev);
6199
6200 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
6201}
6202
6203static ssize_t
6204perf_event_mux_interval_ms_store(struct device *dev,
6205 struct device_attribute *attr,
6206 const char *buf, size_t count)
6207{
6208 struct pmu *pmu = dev_get_drvdata(dev);
6209 int timer, cpu, ret;
6210
6211 ret = kstrtoint(buf, 0, &timer);
6212 if (ret)
6213 return ret;
6214
6215 if (timer < 1)
6216 return -EINVAL;
6217
 6218 /* same value, nothing to do */
6219 if (timer == pmu->hrtimer_interval_ms)
6220 return count;
6221
6222 pmu->hrtimer_interval_ms = timer;
6223
6224 /* update all cpuctx for this PMU */
6225 for_each_possible_cpu(cpu) {
6226 struct perf_cpu_context *cpuctx;
6227 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6228 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
6229
6230 if (hrtimer_active(&cpuctx->hrtimer))
6231 hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
6232 }
6233
6234 return count;
6235}
6236
5982static struct device_attribute pmu_dev_attrs[] = { 6237static struct device_attribute pmu_dev_attrs[] = {
5983 __ATTR_RO(type), 6238 __ATTR_RO(type),
5984 __ATTR_NULL, 6239 __ATTR_RW(perf_event_mux_interval_ms),
6240 __ATTR_NULL,
5985}; 6241};
5986 6242
5987static int pmu_bus_running; 6243static int pmu_bus_running;
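
[Editorial note] perf_event_mux_interval_ms_store() above makes the multiplexing hrtimer interval tunable per PMU from sysfs. Assuming the attribute appears under the usual event_source bus directory (the exact path below is an assumption of this note, not something stated in the patch), a tiny C helper to change it could look like the following sketch; "cpu" stands in for whichever PMU device is of interest.

    #include <stdio.h>

    /* Hypothetical path: PMU device attributes are normally exposed under
     * /sys/bus/event_source/devices/<pmu>/. */
    #define MUX_ATTR "/sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms"

    static int set_mux_interval_ms(int ms)
    {
            FILE *f = fopen(MUX_ATTR, "w");

            if (!f)
                    return -1;
            /* The store handler above rejects values < 1 with -EINVAL. */
            fprintf(f, "%d\n", ms);
            return fclose(f);
    }

    int main(void)
    {
            if (set_mux_interval_ms(4))
                    perror("set_mux_interval_ms");
            return 0;
    }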
@@ -6027,7 +6283,7 @@ free_dev:
6027static struct lock_class_key cpuctx_mutex; 6283static struct lock_class_key cpuctx_mutex;
6028static struct lock_class_key cpuctx_lock; 6284static struct lock_class_key cpuctx_lock;
6029 6285
6030int perf_pmu_register(struct pmu *pmu, char *name, int type) 6286int perf_pmu_register(struct pmu *pmu, const char *name, int type)
6031{ 6287{
6032 int cpu, ret; 6288 int cpu, ret;
6033 6289
@@ -6076,7 +6332,9 @@ skip_type:
6076 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); 6332 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6077 cpuctx->ctx.type = cpu_context; 6333 cpuctx->ctx.type = cpu_context;
6078 cpuctx->ctx.pmu = pmu; 6334 cpuctx->ctx.pmu = pmu;
6079 cpuctx->jiffies_interval = 1; 6335
6336 __perf_cpu_hrtimer_init(cpuctx, cpu);
6337
6080 INIT_LIST_HEAD(&cpuctx->rotation_list); 6338 INIT_LIST_HEAD(&cpuctx->rotation_list);
6081 cpuctx->unique_pmu = pmu; 6339 cpuctx->unique_pmu = pmu;
6082 } 6340 }
@@ -6402,11 +6660,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6402 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) 6660 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
6403 return -EINVAL; 6661 return -EINVAL;
6404 6662
6405 /* kernel level capture: check permissions */
6406 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6407 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6408 return -EACCES;
6409
6410 /* propagate priv level, when not set for branch */ 6663 /* propagate priv level, when not set for branch */
6411 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { 6664 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
6412 6665
@@ -6424,6 +6677,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6424 */ 6677 */
6425 attr->branch_sample_type = mask; 6678 attr->branch_sample_type = mask;
6426 } 6679 }
6680 /* privileged levels capture (kernel, hv): check permissions */
6681 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6682 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6683 return -EACCES;
6427 } 6684 }
6428 6685
6429 if (attr->sample_type & PERF_SAMPLE_REGS_USER) { 6686 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -7228,7 +7485,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
7228 * child. 7485 * child.
7229 */ 7486 */
7230 7487
7231 child_ctx = alloc_perf_context(event->pmu, child); 7488 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
7232 if (!child_ctx) 7489 if (!child_ctx)
7233 return -ENOMEM; 7490 return -ENOMEM;
7234 7491
@@ -7371,7 +7628,7 @@ static void __init perf_event_init_all_cpus(void)
7371 } 7628 }
7372} 7629}
7373 7630
7374static void __cpuinit perf_event_init_cpu(int cpu) 7631static void perf_event_init_cpu(int cpu)
7375{ 7632{
7376 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 7633 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7377 7634
@@ -7460,7 +7717,7 @@ static struct notifier_block perf_reboot_notifier = {
7460 .priority = INT_MIN, 7717 .priority = INT_MIN,
7461}; 7718};
7462 7719
7463static int __cpuinit 7720static int
7464perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) 7721perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
7465{ 7722{
7466 unsigned int cpu = (long)hcpu; 7723 unsigned int cpu = (long)hcpu;
@@ -7476,7 +7733,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
7476 case CPU_DOWN_PREPARE: 7733 case CPU_DOWN_PREPARE:
7477 perf_event_exit_cpu(cpu); 7734 perf_event_exit_cpu(cpu);
7478 break; 7735 break;
7479
7480 default: 7736 default:
7481 break; 7737 break;
7482 } 7738 }
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 20185ea64aa6..1559fb0b9296 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -46,23 +46,26 @@
46#include <linux/smp.h> 46#include <linux/smp.h>
47 47
48#include <linux/hw_breakpoint.h> 48#include <linux/hw_breakpoint.h>
49
50
51/* 49/*
52 * Constraints data 50 * Constraints data
53 */ 51 */
52struct bp_cpuinfo {
53 /* Number of pinned cpu breakpoints in a cpu */
54 unsigned int cpu_pinned;
55 /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
56 unsigned int *tsk_pinned;
57 /* Number of non-pinned cpu/task breakpoints in a cpu */
58 unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
59};
54 60
55/* Number of pinned cpu breakpoints in a cpu */ 61static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
56static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
57
58/* Number of pinned task breakpoints in a cpu */
59static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
60
61/* Number of non-pinned cpu/task breakpoints in a cpu */
62static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
63
64static int nr_slots[TYPE_MAX]; 62static int nr_slots[TYPE_MAX];
65 63
64static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
65{
66 return per_cpu_ptr(bp_cpuinfo + type, cpu);
67}
68
66/* Keep track of the breakpoints attached to tasks */ 69/* Keep track of the breakpoints attached to tasks */
67static LIST_HEAD(bp_task_head); 70static LIST_HEAD(bp_task_head);
68 71
@@ -96,8 +99,8 @@ static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
96 */ 99 */
97static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) 100static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
98{ 101{
102 unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
99 int i; 103 int i;
100 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
101 104
102 for (i = nr_slots[type] - 1; i >= 0; i--) { 105 for (i = nr_slots[type] - 1; i >= 0; i--) {
103 if (tsk_pinned[i] > 0) 106 if (tsk_pinned[i] > 0)
@@ -127,6 +130,13 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
127 return count; 130 return count;
128} 131}
129 132
133static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
134{
135 if (bp->cpu >= 0)
136 return cpumask_of(bp->cpu);
137 return cpu_possible_mask;
138}
139
130/* 140/*
131 * Report the number of pinned/un-pinned breakpoints we have in 141 * Report the number of pinned/un-pinned breakpoints we have in
132 * a given cpu (cpu > -1) or in all of them (cpu = -1). 142 * a given cpu (cpu > -1) or in all of them (cpu = -1).
@@ -135,25 +145,15 @@ static void
135fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, 145fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
136 enum bp_type_idx type) 146 enum bp_type_idx type)
137{ 147{
138 int cpu = bp->cpu; 148 const struct cpumask *cpumask = cpumask_of_bp(bp);
139 struct task_struct *tsk = bp->hw.bp_target; 149 int cpu;
140
141 if (cpu >= 0) {
142 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
143 if (!tsk)
144 slots->pinned += max_task_bp_pinned(cpu, type);
145 else
146 slots->pinned += task_bp_pinned(cpu, bp, type);
147 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
148
149 return;
150 }
151 150
152 for_each_possible_cpu(cpu) { 151 for_each_cpu(cpu, cpumask) {
153 unsigned int nr; 152 struct bp_cpuinfo *info = get_bp_info(cpu, type);
153 int nr;
154 154
155 nr = per_cpu(nr_cpu_bp_pinned[type], cpu); 155 nr = info->cpu_pinned;
156 if (!tsk) 156 if (!bp->hw.bp_target)
157 nr += max_task_bp_pinned(cpu, type); 157 nr += max_task_bp_pinned(cpu, type);
158 else 158 else
159 nr += task_bp_pinned(cpu, bp, type); 159 nr += task_bp_pinned(cpu, bp, type);
@@ -161,8 +161,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
161 if (nr > slots->pinned) 161 if (nr > slots->pinned)
162 slots->pinned = nr; 162 slots->pinned = nr;
163 163
164 nr = per_cpu(nr_bp_flexible[type], cpu); 164 nr = info->flexible;
165
166 if (nr > slots->flexible) 165 if (nr > slots->flexible)
167 slots->flexible = nr; 166 slots->flexible = nr;
168 } 167 }
@@ -182,29 +181,19 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
182/* 181/*
183 * Add a pinned breakpoint for the given task in our constraint table 182 * Add a pinned breakpoint for the given task in our constraint table
184 */ 183 */
185static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, 184static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
186 enum bp_type_idx type, int weight) 185 enum bp_type_idx type, int weight)
187{ 186{
188 unsigned int *tsk_pinned; 187 unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
189 int old_count = 0; 188 int old_idx, new_idx;
190 int old_idx = 0; 189
191 int idx = 0; 190 old_idx = task_bp_pinned(cpu, bp, type) - 1;
192 191 new_idx = old_idx + weight;
193 old_count = task_bp_pinned(cpu, bp, type); 192
194 old_idx = old_count - 1; 193 if (old_idx >= 0)
195 idx = old_idx + weight; 194 tsk_pinned[old_idx]--;
196 195 if (new_idx >= 0)
197 /* tsk_pinned[n] is the number of tasks having n breakpoints */ 196 tsk_pinned[new_idx]++;
198 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
199 if (enable) {
200 tsk_pinned[idx]++;
201 if (old_count > 0)
202 tsk_pinned[old_idx]--;
203 } else {
204 tsk_pinned[idx]--;
205 if (old_count > 0)
206 tsk_pinned[old_idx]++;
207 }
208} 197}
209 198
210/* 199/*
@@ -214,33 +203,26 @@ static void
214toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, 203toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
215 int weight) 204 int weight)
216{ 205{
217 int cpu = bp->cpu; 206 const struct cpumask *cpumask = cpumask_of_bp(bp);
218 struct task_struct *tsk = bp->hw.bp_target; 207 int cpu;
219 208
220 /* Pinned counter cpu profiling */ 209 if (!enable)
221 if (!tsk) { 210 weight = -weight;
222 211
223 if (enable) 212 /* Pinned counter cpu profiling */
224 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; 213 if (!bp->hw.bp_target) {
225 else 214 get_bp_info(bp->cpu, type)->cpu_pinned += weight;
226 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
227 return; 215 return;
228 } 216 }
229 217
230 /* Pinned counter task profiling */ 218 /* Pinned counter task profiling */
231 219 for_each_cpu(cpu, cpumask)
232 if (!enable) 220 toggle_bp_task_slot(bp, cpu, type, weight);
233 list_del(&bp->hw.bp_list);
234
235 if (cpu >= 0) {
236 toggle_bp_task_slot(bp, cpu, enable, type, weight);
237 } else {
238 for_each_possible_cpu(cpu)
239 toggle_bp_task_slot(bp, cpu, enable, type, weight);
240 }
241 221
242 if (enable) 222 if (enable)
243 list_add_tail(&bp->hw.bp_list, &bp_task_head); 223 list_add_tail(&bp->hw.bp_list, &bp_task_head);
224 else
225 list_del(&bp->hw.bp_list);
244} 226}
245 227
246/* 228/*
@@ -261,8 +243,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
261 * 243 *
262 * - If attached to a single cpu, check: 244 * - If attached to a single cpu, check:
263 * 245 *
264 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) 246 * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu)
265 * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM 247 * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM
266 * 248 *
267 * -> If there are already non-pinned counters in this cpu, it means 249 * -> If there are already non-pinned counters in this cpu, it means
268 * there is already a free slot for them. 250 * there is already a free slot for them.
@@ -272,8 +254,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
272 * 254 *
273 * - If attached to every cpus, check: 255 * - If attached to every cpus, check:
274 * 256 *
275 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) 257 * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *))
276 * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM 258 * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM
277 * 259 *
278 * -> This is roughly the same, except we check the number of per cpu 260 * -> This is roughly the same, except we check the number of per cpu
279 * bp for every cpu and we keep the max one. Same for the per tasks 261 * bp for every cpu and we keep the max one. Same for the per tasks
@@ -284,16 +266,16 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
284 * 266 *
285 * - If attached to a single cpu, check: 267 * - If attached to a single cpu, check:
286 * 268 *
287 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) 269 * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu)
288 * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM 270 * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM
289 * 271 *
290 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep 272 * -> Same checks as before. But now the info->flexible, if any, must keep
291 * one register at least (or they will never be fed). 273 * one register at least (or they will never be fed).
292 * 274 *
293 * - If attached to every cpus, check: 275 * - If attached to every cpus, check:
294 * 276 *
295 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) 277 * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
296 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM 278 * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
297 */ 279 */
298static int __reserve_bp_slot(struct perf_event *bp) 280static int __reserve_bp_slot(struct perf_event *bp)
299{ 281{
@@ -518,8 +500,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
518 perf_overflow_handler_t triggered, 500 perf_overflow_handler_t triggered,
519 void *context) 501 void *context)
520{ 502{
521 struct perf_event * __percpu *cpu_events, **pevent, *bp; 503 struct perf_event * __percpu *cpu_events, *bp;
522 long err; 504 long err = 0;
523 int cpu; 505 int cpu;
524 506
525 cpu_events = alloc_percpu(typeof(*cpu_events)); 507 cpu_events = alloc_percpu(typeof(*cpu_events));
@@ -528,31 +510,21 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
528 510
529 get_online_cpus(); 511 get_online_cpus();
530 for_each_online_cpu(cpu) { 512 for_each_online_cpu(cpu) {
531 pevent = per_cpu_ptr(cpu_events, cpu);
532 bp = perf_event_create_kernel_counter(attr, cpu, NULL, 513 bp = perf_event_create_kernel_counter(attr, cpu, NULL,
533 triggered, context); 514 triggered, context);
534
535 *pevent = bp;
536
537 if (IS_ERR(bp)) { 515 if (IS_ERR(bp)) {
538 err = PTR_ERR(bp); 516 err = PTR_ERR(bp);
539 goto fail; 517 break;
540 } 518 }
541 }
542 put_online_cpus();
543 519
544 return cpu_events; 520 per_cpu(*cpu_events, cpu) = bp;
545
546fail:
547 for_each_online_cpu(cpu) {
548 pevent = per_cpu_ptr(cpu_events, cpu);
549 if (IS_ERR(*pevent))
550 break;
551 unregister_hw_breakpoint(*pevent);
552 } 521 }
553 put_online_cpus(); 522 put_online_cpus();
554 523
555 free_percpu(cpu_events); 524 if (likely(!err))
525 return cpu_events;
526
527 unregister_wide_hw_breakpoint(cpu_events);
556 return (void __percpu __force *)ERR_PTR(err); 528 return (void __percpu __force *)ERR_PTR(err);
557} 529}
558EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 530EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
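
[Editorial note] For context on the error path simplified in the hunk above, a register_wide_hw_breakpoint() caller looks roughly like the sketch below. It is loosely modeled on the in-tree samples/hw_breakpoint module; the watched symbol, handler body and module boilerplate are illustrative assumptions, and this is kernel-module code rather than something buildable on its own.

    #include <linux/module.h>
    #include <linux/kallsyms.h>
    #include <linux/perf_event.h>
    #include <linux/hw_breakpoint.h>

    static struct perf_event * __percpu *wp;

    /* Invoked whenever the watched word is written on any CPU. */
    static void wp_handler(struct perf_event *bp, struct perf_sample_data *data,
                           struct pt_regs *regs)
    {
            pr_info("write to watched symbol detected\n");
    }

    static int __init wp_init(void)
    {
            struct perf_event_attr attr;

            hw_breakpoint_init(&attr);
            attr.bp_addr = kallsyms_lookup_name("jiffies"); /* illustrative target */
            attr.bp_len  = HW_BREAKPOINT_LEN_4;
            attr.bp_type = HW_BREAKPOINT_W;

            wp = register_wide_hw_breakpoint(&attr, wp_handler, NULL);
            if (IS_ERR((void __force *)wp))
                    return PTR_ERR((void __force *)wp);
            return 0;
    }

    static void __exit wp_exit(void)
    {
            unregister_wide_hw_breakpoint(wp);
    }

    module_init(wp_init);
    module_exit(wp_exit);
    MODULE_LICENSE("GPL");

If perf_event_create_kernel_counter() fails for any online CPU, the rewritten register path above now hands the partially populated per-cpu array straight to unregister_wide_hw_breakpoint(), which tolerates NULL entries for the CPUs that were never set up.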
@@ -564,12 +536,10 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
564void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) 536void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
565{ 537{
566 int cpu; 538 int cpu;
567 struct perf_event **pevent;
568 539
569 for_each_possible_cpu(cpu) { 540 for_each_possible_cpu(cpu)
570 pevent = per_cpu_ptr(cpu_events, cpu); 541 unregister_hw_breakpoint(per_cpu(*cpu_events, cpu));
571 unregister_hw_breakpoint(*pevent); 542
572 }
573 free_percpu(cpu_events); 543 free_percpu(cpu_events);
574} 544}
575EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); 545EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
@@ -612,6 +582,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags)
612 if (!(flags & PERF_EF_START)) 582 if (!(flags & PERF_EF_START))
613 bp->hw.state = PERF_HES_STOPPED; 583 bp->hw.state = PERF_HES_STOPPED;
614 584
585 if (is_sampling_event(bp)) {
586 bp->hw.last_period = bp->hw.sample_period;
587 perf_swevent_set_period(bp);
588 }
589
615 return arch_install_hw_breakpoint(bp); 590 return arch_install_hw_breakpoint(bp);
616} 591}
617 592
@@ -650,7 +625,6 @@ static struct pmu perf_breakpoint = {
650 625
651int __init init_hw_breakpoint(void) 626int __init init_hw_breakpoint(void)
652{ 627{
653 unsigned int **task_bp_pinned;
654 int cpu, err_cpu; 628 int cpu, err_cpu;
655 int i; 629 int i;
656 630
@@ -659,10 +633,11 @@ int __init init_hw_breakpoint(void)
659 633
660 for_each_possible_cpu(cpu) { 634 for_each_possible_cpu(cpu) {
661 for (i = 0; i < TYPE_MAX; i++) { 635 for (i = 0; i < TYPE_MAX; i++) {
662 task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); 636 struct bp_cpuinfo *info = get_bp_info(cpu, i);
663 *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], 637
664 GFP_KERNEL); 638 info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
665 if (!*task_bp_pinned) 639 GFP_KERNEL);
640 if (!info->tsk_pinned)
666 goto err_alloc; 641 goto err_alloc;
667 } 642 }
668 } 643 }
@@ -676,7 +651,7 @@ int __init init_hw_breakpoint(void)
676 err_alloc: 651 err_alloc:
677 for_each_possible_cpu(err_cpu) { 652 for_each_possible_cpu(err_cpu) {
678 for (i = 0; i < TYPE_MAX; i++) 653 for (i = 0; i < TYPE_MAX; i++)
679 kfree(per_cpu(nr_task_bp_pinned[i], err_cpu)); 654 kfree(get_bp_info(err_cpu, i)->tsk_pinned);
680 if (err_cpu == cpu) 655 if (err_cpu == cpu)
681 break; 656 break;
682 } 657 }
diff --git a/kernel/exit.c b/kernel/exit.c
index 7bb73f9d09db..a949819055d5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -312,17 +312,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
312 } 312 }
313} 313}
314 314
315void __set_special_pids(struct pid *pid)
316{
317 struct task_struct *curr = current->group_leader;
318
319 if (task_session(curr) != pid)
320 change_pid(curr, PIDTYPE_SID, pid);
321
322 if (task_pgrp(curr) != pid)
323 change_pid(curr, PIDTYPE_PGID, pid);
324}
325
326/* 315/*
327 * Let kernel threads use this to say that they allow a certain signal. 316 * Let kernel threads use this to say that they allow a certain signal.
328 * Must not be used if kthread was cloned with CLONE_SIGHAND. 317 * Must not be used if kthread was cloned with CLONE_SIGHAND.
@@ -819,7 +808,7 @@ void do_exit(long code)
819 /* 808 /*
820 * FIXME: do that only when needed, using sched_exit tracepoint 809 * FIXME: do that only when needed, using sched_exit tracepoint
821 */ 810 */
822 ptrace_put_breakpoints(tsk); 811 flush_ptrace_hw_breakpoint(tsk);
823 812
824 exit_notify(tsk, group_dead); 813 exit_notify(tsk, group_dead);
825#ifdef CONFIG_NUMA 814#ifdef CONFIG_NUMA
@@ -835,7 +824,7 @@ void do_exit(long code)
835 /* 824 /*
836 * Make sure we are holding no locks: 825 * Make sure we are holding no locks:
837 */ 826 */
838 debug_check_no_locks_held(tsk); 827 debug_check_no_locks_held();
839 /* 828 /*
840 * We can do this unlocked here. The futex code uses this flag 829 * We can do this unlocked here. The futex code uses this flag
841 * just to verify whether the pi state cleanup has been done 830 * just to verify whether the pi state cleanup has been done
diff --git a/kernel/fork.c b/kernel/fork.c
index 987b28a1f01b..e23bb19e2a3e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -365,8 +365,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
365 mm->locked_vm = 0; 365 mm->locked_vm = 0;
366 mm->mmap = NULL; 366 mm->mmap = NULL;
367 mm->mmap_cache = NULL; 367 mm->mmap_cache = NULL;
368 mm->free_area_cache = oldmm->mmap_base;
369 mm->cached_hole_size = ~0UL;
370 mm->map_count = 0; 368 mm->map_count = 0;
371 cpumask_clear(mm_cpumask(mm)); 369 cpumask_clear(mm_cpumask(mm));
372 mm->mm_rb = RB_ROOT; 370 mm->mm_rb = RB_ROOT;
@@ -540,8 +538,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
540 mm->nr_ptes = 0; 538 mm->nr_ptes = 0;
541 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); 539 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
542 spin_lock_init(&mm->page_table_lock); 540 spin_lock_init(&mm->page_table_lock);
543 mm->free_area_cache = TASK_UNMAPPED_BASE;
544 mm->cached_hole_size = ~0UL;
545 mm_init_aio(mm); 541 mm_init_aio(mm);
546 mm_init_owner(mm, p); 542 mm_init_owner(mm, p);
547 543
@@ -1121,6 +1117,12 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
1121 INIT_LIST_HEAD(&tsk->cpu_timers[2]); 1117 INIT_LIST_HEAD(&tsk->cpu_timers[2]);
1122} 1118}
1123 1119
1120static inline void
1121init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
1122{
1123 task->pids[type].pid = pid;
1124}
1125
1124/* 1126/*
1125 * This creates a new process as a copy of the old one, 1127 * This creates a new process as a copy of the old one,
1126 * but does not actually start it yet. 1128 * but does not actually start it yet.
@@ -1199,8 +1201,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1199 retval = -EAGAIN; 1201 retval = -EAGAIN;
1200 if (atomic_read(&p->real_cred->user->processes) >= 1202 if (atomic_read(&p->real_cred->user->processes) >=
1201 task_rlimit(p, RLIMIT_NPROC)) { 1203 task_rlimit(p, RLIMIT_NPROC)) {
1202 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1204 if (p->real_cred->user != INIT_USER &&
1203 p->real_cred->user != INIT_USER) 1205 !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
1204 goto bad_fork_free; 1206 goto bad_fork_free;
1205 } 1207 }
1206 current->flags &= ~PF_NPROC_EXCEEDED; 1208 current->flags &= ~PF_NPROC_EXCEEDED;
@@ -1354,11 +1356,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1354 goto bad_fork_cleanup_io; 1356 goto bad_fork_cleanup_io;
1355 } 1357 }
1356 1358
1357 p->pid = pid_nr(pid);
1358 p->tgid = p->pid;
1359 if (clone_flags & CLONE_THREAD)
1360 p->tgid = current->tgid;
1361
1362 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1359 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1363 /* 1360 /*
1364 * Clear TID on mm_release()? 1361 * Clear TID on mm_release()?
@@ -1394,12 +1391,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1394 clear_all_latency_tracing(p); 1391 clear_all_latency_tracing(p);
1395 1392
1396 /* ok, now we should be set up.. */ 1393 /* ok, now we should be set up.. */
1397 if (clone_flags & CLONE_THREAD) 1394 p->pid = pid_nr(pid);
1395 if (clone_flags & CLONE_THREAD) {
1398 p->exit_signal = -1; 1396 p->exit_signal = -1;
1399 else if (clone_flags & CLONE_PARENT) 1397 p->group_leader = current->group_leader;
1400 p->exit_signal = current->group_leader->exit_signal; 1398 p->tgid = current->tgid;
1401 else 1399 } else {
1402 p->exit_signal = (clone_flags & CSIGNAL); 1400 if (clone_flags & CLONE_PARENT)
1401 p->exit_signal = current->group_leader->exit_signal;
1402 else
1403 p->exit_signal = (clone_flags & CSIGNAL);
1404 p->group_leader = p;
1405 p->tgid = p->pid;
1406 }
1403 1407
1404 p->pdeath_signal = 0; 1408 p->pdeath_signal = 0;
1405 p->exit_state = 0; 1409 p->exit_state = 0;
@@ -1408,15 +1412,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1408 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); 1412 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1409 p->dirty_paused_when = 0; 1413 p->dirty_paused_when = 0;
1410 1414
1411 /*
1412 * Ok, make it visible to the rest of the system.
1413 * We dont wake it up yet.
1414 */
1415 p->group_leader = p;
1416 INIT_LIST_HEAD(&p->thread_group); 1415 INIT_LIST_HEAD(&p->thread_group);
1417 p->task_works = NULL; 1416 p->task_works = NULL;
1418 1417
1419 /* Need tasklist lock for parent etc handling! */ 1418 /*
1419 * Make it visible to the rest of the system, but dont wake it up yet.
1420 * Need tasklist lock for parent etc handling!
1421 */
1420 write_lock_irq(&tasklist_lock); 1422 write_lock_irq(&tasklist_lock);
1421 1423
1422 /* CLONE_PARENT re-uses the old parent */ 1424 /* CLONE_PARENT re-uses the old parent */
@@ -1446,18 +1448,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1446 goto bad_fork_free_pid; 1448 goto bad_fork_free_pid;
1447 } 1449 }
1448 1450
1449 if (clone_flags & CLONE_THREAD) {
1450 current->signal->nr_threads++;
1451 atomic_inc(&current->signal->live);
1452 atomic_inc(&current->signal->sigcnt);
1453 p->group_leader = current->group_leader;
1454 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1455 }
1456
1457 if (likely(p->pid)) { 1451 if (likely(p->pid)) {
1458 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1452 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1459 1453
1454 init_task_pid(p, PIDTYPE_PID, pid);
1460 if (thread_group_leader(p)) { 1455 if (thread_group_leader(p)) {
1456 init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
1457 init_task_pid(p, PIDTYPE_SID, task_session(current));
1458
1461 if (is_child_reaper(pid)) { 1459 if (is_child_reaper(pid)) {
1462 ns_of_pid(pid)->child_reaper = p; 1460 ns_of_pid(pid)->child_reaper = p;
1463 p->signal->flags |= SIGNAL_UNKILLABLE; 1461 p->signal->flags |= SIGNAL_UNKILLABLE;
@@ -1465,13 +1463,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1465 1463
1466 p->signal->leader_pid = pid; 1464 p->signal->leader_pid = pid;
1467 p->signal->tty = tty_kref_get(current->signal->tty); 1465 p->signal->tty = tty_kref_get(current->signal->tty);
1468 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1469 attach_pid(p, PIDTYPE_SID, task_session(current));
1470 list_add_tail(&p->sibling, &p->real_parent->children); 1466 list_add_tail(&p->sibling, &p->real_parent->children);
1471 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1467 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1468 attach_pid(p, PIDTYPE_PGID);
1469 attach_pid(p, PIDTYPE_SID);
1472 __this_cpu_inc(process_counts); 1470 __this_cpu_inc(process_counts);
1471 } else {
1472 current->signal->nr_threads++;
1473 atomic_inc(&current->signal->live);
1474 atomic_inc(&current->signal->sigcnt);
1475 list_add_tail_rcu(&p->thread_group,
1476 &p->group_leader->thread_group);
1473 } 1477 }
1474 attach_pid(p, PIDTYPE_PID, pid); 1478 attach_pid(p, PIDTYPE_PID);
1475 nr_threads++; 1479 nr_threads++;
1476 } 1480 }
1477 1481
@@ -1542,7 +1546,7 @@ static inline void init_idle_pids(struct pid_link *links)
1542 } 1546 }
1543} 1547}
1544 1548
1545struct task_struct * __cpuinit fork_idle(int cpu) 1549struct task_struct *fork_idle(int cpu)
1546{ 1550{
1547 struct task_struct *task; 1551 struct task_struct *task;
1548 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); 1552 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
@@ -1675,6 +1679,12 @@ SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
1675 int __user *, parent_tidptr, 1679 int __user *, parent_tidptr,
1676 int __user *, child_tidptr, 1680 int __user *, child_tidptr,
1677 int, tls_val) 1681 int, tls_val)
1682#elif defined(CONFIG_CLONE_BACKWARDS3)
1683SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
1684 int, stack_size,
1685 int __user *, parent_tidptr,
1686 int __user *, child_tidptr,
1687 int, tls_val)
1678#else 1688#else
1679SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, 1689SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1680 int __user *, parent_tidptr, 1690 int __user *, parent_tidptr,
diff --git a/kernel/freezer.c b/kernel/freezer.c
index c38893b0efba..b462fa197517 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -33,7 +33,7 @@ static DEFINE_SPINLOCK(freezer_lock);
33 */ 33 */
34bool freezing_slow_path(struct task_struct *p) 34bool freezing_slow_path(struct task_struct *p)
35{ 35{
36 if (p->flags & PF_NOFREEZE) 36 if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
37 return false; 37 return false;
38 38
39 if (pm_nosig_freezing || cgroup_freezing(p)) 39 if (pm_nosig_freezing || cgroup_freezing(p))
@@ -110,6 +110,18 @@ bool freeze_task(struct task_struct *p)
110{ 110{
111 unsigned long flags; 111 unsigned long flags;
112 112
113 /*
114 * This check can race with freezer_do_not_count, but worst case that
115 * will result in an extra wakeup being sent to the task. It does not
116 * race with freezer_count(), the barriers in freezer_count() and
117 * freezer_should_skip() ensure that either freezer_count() sees
118 * freezing == true in try_to_freeze() and freezes, or
 119 * freezer_should_skip() sees !PF_FREEZER_SKIP and freezes the task
120 * normally.
121 */
122 if (freezer_should_skip(p))
123 return false;
124
113 spin_lock_irqsave(&freezer_lock, flags); 125 spin_lock_irqsave(&freezer_lock, flags);
114 if (!freezing(p) || frozen(p)) { 126 if (!freezing(p) || frozen(p)) {
115 spin_unlock_irqrestore(&freezer_lock, flags); 127 spin_unlock_irqrestore(&freezer_lock, flags);
diff --git a/kernel/futex.c b/kernel/futex.c
index b26dcfc02c94..c3a1a55a5214 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -61,6 +61,8 @@
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
62#include <linux/ptrace.h> 62#include <linux/ptrace.h>
63#include <linux/sched/rt.h> 63#include <linux/sched/rt.h>
64#include <linux/hugetlb.h>
65#include <linux/freezer.h>
64 66
65#include <asm/futex.h> 67#include <asm/futex.h>
66 68
@@ -365,7 +367,7 @@ again:
365 } else { 367 } else {
366 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 368 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
367 key->shared.inode = page_head->mapping->host; 369 key->shared.inode = page_head->mapping->host;
368 key->shared.pgoff = page_head->index; 370 key->shared.pgoff = basepage_index(page);
369 } 371 }
370 372
371 get_futex_key_refs(key); 373 get_futex_key_refs(key);
@@ -1807,7 +1809,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1807 * is no timeout, or if it has yet to expire. 1809 * is no timeout, or if it has yet to expire.
1808 */ 1810 */
1809 if (!timeout || timeout->task) 1811 if (!timeout || timeout->task)
1810 schedule(); 1812 freezable_schedule();
1811 } 1813 }
1812 __set_current_state(TASK_RUNNING); 1814 __set_current_state(TASK_RUNNING);
1813} 1815}
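With freezable_schedule() a task parked in FUTEX_WAIT can now be frozen instead of blocking a suspend. A hedged userspace sketch of such a sleeper, waiting on a private futex word with a two-second relative timeout (the futex word and timeout are illustrative):

#include <errno.h>
#include <linux/futex.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static int futex_word;			/* futexes operate on a 32-bit word */

int main(void)
{
	struct timespec timeout = { .tv_sec = 2 };	/* relative timeout */
	long ret;

	/* Sleeps in futex_wait_queue_me() until woken, signalled or timed
	 * out; the sleep itself is now freezable_schedule() in the kernel.
	 */
	ret = syscall(SYS_futex, &futex_word, FUTEX_WAIT,
		      0 /* expected value */, &timeout, NULL, 0);
	printf("FUTEX_WAIT returned %ld (%s)\n", ret,
	       ret ? strerror(errno) : "woken");
	return 0;
}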
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index fd4b13b131f8..383319bae3f7 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -47,6 +47,7 @@
47#include <linux/sched/sysctl.h> 47#include <linux/sched/sysctl.h>
48#include <linux/sched/rt.h> 48#include <linux/sched/rt.h>
49#include <linux/timer.h> 49#include <linux/timer.h>
50#include <linux/freezer.h>
50 51
51#include <asm/uaccess.h> 52#include <asm/uaccess.h>
52 53
@@ -721,17 +722,20 @@ static int hrtimer_switch_to_hres(void)
721 return 1; 722 return 1;
722} 723}
723 724
725static void clock_was_set_work(struct work_struct *work)
726{
727 clock_was_set();
728}
729
730static DECLARE_WORK(hrtimer_work, clock_was_set_work);
731
724/* 732/*
 725 * Called from timekeeping code to reprogram the hrtimer interrupt 733 * Called from timekeeping and resume code to reprogram the hrtimer
726 * device. If called from the timer interrupt context we defer it to 734 * interrupt device on all cpus.
727 * softirq context.
728 */ 735 */
729void clock_was_set_delayed(void) 736void clock_was_set_delayed(void)
730{ 737{
731 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 738 schedule_work(&hrtimer_work);
732
733 cpu_base->clock_was_set = 1;
734 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
735} 739}
736 740
737#else 741#else
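Replacing the per-cpu softirq flag with a work item is the usual way to defer work that may sleep out of interrupt context. A hedged generic sketch of the DECLARE_WORK()/schedule_work() pattern used above; the demo_* names are placeholders:

#include <linux/workqueue.h>

static void demo_deferred_fn(struct work_struct *work)
{
	/* Runs later in a kworker thread (process context), may sleep. */
}

static DECLARE_WORK(demo_work, demo_deferred_fn);

/* Safe from hard interrupt context: queuing is cheap and non-blocking, and
 * repeated calls while the work is still pending collapse into one run.
 */
static void demo_event_from_irq(void)
{
	schedule_work(&demo_work);
}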
@@ -773,15 +777,19 @@ void clock_was_set(void)
773 777
774/* 778/*
775 * During resume we might have to reprogram the high resolution timer 779 * During resume we might have to reprogram the high resolution timer
776 * interrupt (on the local CPU): 780 * interrupt on all online CPUs. However, all other CPUs will be
 781 * stopped with interrupts disabled so the clock_was_set() call
782 * must be deferred.
777 */ 783 */
778void hrtimers_resume(void) 784void hrtimers_resume(void)
779{ 785{
780 WARN_ONCE(!irqs_disabled(), 786 WARN_ONCE(!irqs_disabled(),
781 KERN_INFO "hrtimers_resume() called with IRQs enabled!"); 787 KERN_INFO "hrtimers_resume() called with IRQs enabled!");
782 788
789 /* Retrigger on the local CPU */
783 retrigger_next_event(NULL); 790 retrigger_next_event(NULL);
784 timerfd_clock_was_set(); 791 /* And schedule a retrigger for all others */
792 clock_was_set_delayed();
785} 793}
786 794
787static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) 795static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
@@ -1432,13 +1440,6 @@ void hrtimer_peek_ahead_timers(void)
1432 1440
1433static void run_hrtimer_softirq(struct softirq_action *h) 1441static void run_hrtimer_softirq(struct softirq_action *h)
1434{ 1442{
1435 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1436
1437 if (cpu_base->clock_was_set) {
1438 cpu_base->clock_was_set = 0;
1439 clock_was_set();
1440 }
1441
1442 hrtimer_peek_ahead_timers(); 1443 hrtimer_peek_ahead_timers();
1443} 1444}
1444 1445
@@ -1545,7 +1546,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1545 t->task = NULL; 1546 t->task = NULL;
1546 1547
1547 if (likely(t->task)) 1548 if (likely(t->task))
1548 schedule(); 1549 freezable_schedule();
1549 1550
1550 hrtimer_cancel(&t->timer); 1551 hrtimer_cancel(&t->timer);
1551 mode = HRTIMER_MODE_ABS; 1552 mode = HRTIMER_MODE_ABS;
@@ -1658,7 +1659,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
1658/* 1659/*
1659 * Functions related to boot-time initialization: 1660 * Functions related to boot-time initialization:
1660 */ 1661 */
1661static void __cpuinit init_hrtimers_cpu(int cpu) 1662static void init_hrtimers_cpu(int cpu)
1662{ 1663{
1663 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1664 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1664 int i; 1665 int i;
@@ -1739,7 +1740,7 @@ static void migrate_hrtimers(int scpu)
1739 1740
1740#endif /* CONFIG_HOTPLUG_CPU */ 1741#endif /* CONFIG_HOTPLUG_CPU */
1741 1742
1742static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, 1743static int hrtimer_cpu_notify(struct notifier_block *self,
1743 unsigned long action, void *hcpu) 1744 unsigned long action, void *hcpu)
1744{ 1745{
1745 int scpu = (long)hcpu; 1746 int scpu = (long)hcpu;
@@ -1772,7 +1773,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1772 return NOTIFY_OK; 1773 return NOTIFY_OK;
1773} 1774}
1774 1775
1775static struct notifier_block __cpuinitdata hrtimers_nb = { 1776static struct notifier_block hrtimers_nb = {
1776 .notifier_call = hrtimer_cpu_notify, 1777 .notifier_call = hrtimer_cpu_notify,
1777}; 1778};
1778 1779
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index cbd97ce0b000..a3bb14fbe5c6 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -213,6 +213,19 @@ void irq_enable(struct irq_desc *desc)
213 irq_state_clr_masked(desc); 213 irq_state_clr_masked(desc);
214} 214}
215 215
216/**
 217 * irq_disable - Mark interrupt disabled 218 * irq_disable - Mark interrupt disabled
218 * @desc: irq descriptor which should be disabled
219 *
220 * If the chip does not implement the irq_disable callback, we
221 * use a lazy disable approach. That means we mark the interrupt
222 * disabled, but leave the hardware unmasked. That's an
223 * optimization because we avoid the hardware access for the
224 * common case where no interrupt happens after we marked it
225 * disabled. If an interrupt happens, then the interrupt flow
226 * handler masks the line at the hardware level and marks it
227 * pending.
228 */
216void irq_disable(struct irq_desc *desc) 229void irq_disable(struct irq_desc *desc)
217{ 230{
218 irq_state_set_disabled(desc); 231 irq_state_set_disabled(desc);
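For a chip driver, the lazy behaviour documented above simply falls out of leaving .irq_disable unset. A hedged sketch contrasting the two cases; the register base, offsets and the assumption of fewer than 32 lines are invented for illustration:

#include <linux/bitops.h>
#include <linux/io.h>
#include <linux/irq.h>

static void __iomem *demo_regs;		/* assumed controller register base */

static void demo_mask(struct irq_data *d)
{
	writel(BIT(d->hwirq), demo_regs + 0x10);	/* hypothetical MASK_SET, hwirq < 32 */
}

static void demo_unmask(struct irq_data *d)
{
	writel(BIT(d->hwirq), demo_regs + 0x14);	/* hypothetical MASK_CLR */
}

/* Lazy: without .irq_disable, disable_irq() only marks the descriptor
 * disabled; the flow handler masks the line if it fires anyway.
 */
static struct irq_chip demo_lazy_chip = {
	.name		= "demo-lazy",
	.irq_mask	= demo_mask,
	.irq_unmask	= demo_unmask,
};

/* Eager: .irq_disable makes disable_irq() mask the line immediately. */
static struct irq_chip demo_eager_chip = {
	.name		= "demo-eager",
	.irq_mask	= demo_mask,
	.irq_unmask	= demo_unmask,
	.irq_disable	= demo_mask,
};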
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c89295a8f668..452d6f2ba21d 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -7,6 +7,7 @@
7#include <linux/irq.h> 7#include <linux/irq.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/export.h> 9#include <linux/export.h>
10#include <linux/irqdomain.h>
10#include <linux/interrupt.h> 11#include <linux/interrupt.h>
11#include <linux/kernel_stat.h> 12#include <linux/kernel_stat.h>
12#include <linux/syscore_ops.h> 13#include <linux/syscore_ops.h>
@@ -16,11 +17,6 @@
16static LIST_HEAD(gc_list); 17static LIST_HEAD(gc_list);
17static DEFINE_RAW_SPINLOCK(gc_lock); 18static DEFINE_RAW_SPINLOCK(gc_lock);
18 19
19static inline struct irq_chip_regs *cur_regs(struct irq_data *d)
20{
21 return &container_of(d->chip, struct irq_chip_type, chip)->regs;
22}
23
24/** 20/**
25 * irq_gc_noop - NOOP function 21 * irq_gc_noop - NOOP function
26 * @d: irq_data 22 * @d: irq_data
@@ -39,16 +35,17 @@ void irq_gc_noop(struct irq_data *d)
39void irq_gc_mask_disable_reg(struct irq_data *d) 35void irq_gc_mask_disable_reg(struct irq_data *d)
40{ 36{
41 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 37 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
42 u32 mask = 1 << (d->irq - gc->irq_base); 38 struct irq_chip_type *ct = irq_data_get_chip_type(d);
39 u32 mask = d->mask;
43 40
44 irq_gc_lock(gc); 41 irq_gc_lock(gc);
45 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); 42 irq_reg_writel(mask, gc->reg_base + ct->regs.disable);
46 gc->mask_cache &= ~mask; 43 *ct->mask_cache &= ~mask;
47 irq_gc_unlock(gc); 44 irq_gc_unlock(gc);
48} 45}
49 46
50/** 47/**
51 * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register 48 * irq_gc_mask_set_bit - Mask chip via setting bit in mask register
52 * @d: irq_data 49 * @d: irq_data
53 * 50 *
54 * Chip has a single mask register. Values of this register are cached 51 * Chip has a single mask register. Values of this register are cached
@@ -57,16 +54,18 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
57void irq_gc_mask_set_bit(struct irq_data *d) 54void irq_gc_mask_set_bit(struct irq_data *d)
58{ 55{
59 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 56 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
60 u32 mask = 1 << (d->irq - gc->irq_base); 57 struct irq_chip_type *ct = irq_data_get_chip_type(d);
58 u32 mask = d->mask;
61 59
62 irq_gc_lock(gc); 60 irq_gc_lock(gc);
63 gc->mask_cache |= mask; 61 *ct->mask_cache |= mask;
64 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); 62 irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
65 irq_gc_unlock(gc); 63 irq_gc_unlock(gc);
66} 64}
65EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
67 66
68/** 67/**
69 * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register 68 * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register
70 * @d: irq_data 69 * @d: irq_data
71 * 70 *
72 * Chip has a single mask register. Values of this register are cached 71 * Chip has a single mask register. Values of this register are cached
@@ -75,13 +74,15 @@ void irq_gc_mask_set_bit(struct irq_data *d)
75void irq_gc_mask_clr_bit(struct irq_data *d) 74void irq_gc_mask_clr_bit(struct irq_data *d)
76{ 75{
77 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 76 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
78 u32 mask = 1 << (d->irq - gc->irq_base); 77 struct irq_chip_type *ct = irq_data_get_chip_type(d);
78 u32 mask = d->mask;
79 79
80 irq_gc_lock(gc); 80 irq_gc_lock(gc);
81 gc->mask_cache &= ~mask; 81 *ct->mask_cache &= ~mask;
82 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); 82 irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
83 irq_gc_unlock(gc); 83 irq_gc_unlock(gc);
84} 84}
85EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
85 86
86/** 87/**
87 * irq_gc_unmask_enable_reg - Unmask chip via enable register 88 * irq_gc_unmask_enable_reg - Unmask chip via enable register
@@ -93,11 +94,12 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
93void irq_gc_unmask_enable_reg(struct irq_data *d) 94void irq_gc_unmask_enable_reg(struct irq_data *d)
94{ 95{
95 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 96 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
96 u32 mask = 1 << (d->irq - gc->irq_base); 97 struct irq_chip_type *ct = irq_data_get_chip_type(d);
98 u32 mask = d->mask;
97 99
98 irq_gc_lock(gc); 100 irq_gc_lock(gc);
99 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); 101 irq_reg_writel(mask, gc->reg_base + ct->regs.enable);
100 gc->mask_cache |= mask; 102 *ct->mask_cache |= mask;
101 irq_gc_unlock(gc); 103 irq_gc_unlock(gc);
102} 104}
103 105
@@ -108,12 +110,14 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
108void irq_gc_ack_set_bit(struct irq_data *d) 110void irq_gc_ack_set_bit(struct irq_data *d)
109{ 111{
110 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 112 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
111 u32 mask = 1 << (d->irq - gc->irq_base); 113 struct irq_chip_type *ct = irq_data_get_chip_type(d);
114 u32 mask = d->mask;
112 115
113 irq_gc_lock(gc); 116 irq_gc_lock(gc);
114 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); 117 irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
115 irq_gc_unlock(gc); 118 irq_gc_unlock(gc);
116} 119}
120EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
117 121
118/** 122/**
119 * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit 123 * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
@@ -122,25 +126,27 @@ void irq_gc_ack_set_bit(struct irq_data *d)
122void irq_gc_ack_clr_bit(struct irq_data *d) 126void irq_gc_ack_clr_bit(struct irq_data *d)
123{ 127{
124 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 128 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
125 u32 mask = ~(1 << (d->irq - gc->irq_base)); 129 struct irq_chip_type *ct = irq_data_get_chip_type(d);
130 u32 mask = ~d->mask;
126 131
127 irq_gc_lock(gc); 132 irq_gc_lock(gc);
128 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); 133 irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
129 irq_gc_unlock(gc); 134 irq_gc_unlock(gc);
130} 135}
131 136
132/** 137/**
133 * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt 138 * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
134 * @d: irq_data 139 * @d: irq_data
135 */ 140 */
136void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) 141void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
137{ 142{
138 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 143 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
139 u32 mask = 1 << (d->irq - gc->irq_base); 144 struct irq_chip_type *ct = irq_data_get_chip_type(d);
145 u32 mask = d->mask;
140 146
141 irq_gc_lock(gc); 147 irq_gc_lock(gc);
142 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); 148 irq_reg_writel(mask, gc->reg_base + ct->regs.mask);
143 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); 149 irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
144 irq_gc_unlock(gc); 150 irq_gc_unlock(gc);
145} 151}
146 152
@@ -151,16 +157,18 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
151void irq_gc_eoi(struct irq_data *d) 157void irq_gc_eoi(struct irq_data *d)
152{ 158{
153 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 159 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
154 u32 mask = 1 << (d->irq - gc->irq_base); 160 struct irq_chip_type *ct = irq_data_get_chip_type(d);
161 u32 mask = d->mask;
155 162
156 irq_gc_lock(gc); 163 irq_gc_lock(gc);
157 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); 164 irq_reg_writel(mask, gc->reg_base + ct->regs.eoi);
158 irq_gc_unlock(gc); 165 irq_gc_unlock(gc);
159} 166}
160 167
161/** 168/**
162 * irq_gc_set_wake - Set/clr wake bit for an interrupt 169 * irq_gc_set_wake - Set/clr wake bit for an interrupt
163 * @d: irq_data 170 * @d: irq_data
171 * @on: Indicates whether the wake bit should be set or cleared
164 * 172 *
165 * For chips where the wake from suspend functionality is not 173 * For chips where the wake from suspend functionality is not
166 * configured in a separate register and the wakeup active state is 174 * configured in a separate register and the wakeup active state is
@@ -169,7 +177,7 @@ void irq_gc_eoi(struct irq_data *d)
169int irq_gc_set_wake(struct irq_data *d, unsigned int on) 177int irq_gc_set_wake(struct irq_data *d, unsigned int on)
170{ 178{
171 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 179 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
172 u32 mask = 1 << (d->irq - gc->irq_base); 180 u32 mask = d->mask;
173 181
174 if (!(mask & gc->wake_enabled)) 182 if (!(mask & gc->wake_enabled))
175 return -EINVAL; 183 return -EINVAL;
@@ -183,6 +191,19 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
183 return 0; 191 return 0;
184} 192}
185 193
194static void
195irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
196 int num_ct, unsigned int irq_base,
197 void __iomem *reg_base, irq_flow_handler_t handler)
198{
199 raw_spin_lock_init(&gc->lock);
200 gc->num_ct = num_ct;
201 gc->irq_base = irq_base;
202 gc->reg_base = reg_base;
203 gc->chip_types->chip.name = name;
204 gc->chip_types->handler = handler;
205}
206
186/** 207/**
187 * irq_alloc_generic_chip - Allocate a generic chip and initialize it 208 * irq_alloc_generic_chip - Allocate a generic chip and initialize it
188 * @name: Name of the irq chip 209 * @name: Name of the irq chip
@@ -203,23 +224,183 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
203 224
204 gc = kzalloc(sz, GFP_KERNEL); 225 gc = kzalloc(sz, GFP_KERNEL);
205 if (gc) { 226 if (gc) {
206 raw_spin_lock_init(&gc->lock); 227 irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base,
207 gc->num_ct = num_ct; 228 handler);
208 gc->irq_base = irq_base;
209 gc->reg_base = reg_base;
210 gc->chip_types->chip.name = name;
211 gc->chip_types->handler = handler;
212 } 229 }
213 return gc; 230 return gc;
214} 231}
215EXPORT_SYMBOL_GPL(irq_alloc_generic_chip); 232EXPORT_SYMBOL_GPL(irq_alloc_generic_chip);
216 233
234static void
235irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
236{
237 struct irq_chip_type *ct = gc->chip_types;
238 u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask;
239 int i;
240
241 for (i = 0; i < gc->num_ct; i++) {
242 if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) {
243 mskptr = &ct[i].mask_cache_priv;
244 mskreg = ct[i].regs.mask;
245 }
246 ct[i].mask_cache = mskptr;
247 if (flags & IRQ_GC_INIT_MASK_CACHE)
248 *mskptr = irq_reg_readl(gc->reg_base + mskreg);
249 }
250}
251
252/**
 253 * irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain
254 * @d: irq domain for which to allocate chips
255 * @irqs_per_chip: Number of interrupts each chip handles
 256 * @num_ct: Number of irq_chip_type instances associated with each chip
257 * @name: Name of the irq chip
258 * @handler: Default flow handler associated with these chips
259 * @clr: IRQ_* bits to clear in the mapping function
260 * @set: IRQ_* bits to set in the mapping function
261 * @gcflags: Generic chip specific setup flags
262 */
263int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
264 int num_ct, const char *name,
265 irq_flow_handler_t handler,
266 unsigned int clr, unsigned int set,
267 enum irq_gc_flags gcflags)
268{
269 struct irq_domain_chip_generic *dgc;
270 struct irq_chip_generic *gc;
271 int numchips, sz, i;
272 unsigned long flags;
273 void *tmp;
274
275 if (d->gc)
276 return -EBUSY;
277
278 numchips = DIV_ROUND_UP(d->revmap_size, irqs_per_chip);
279 if (!numchips)
280 return -EINVAL;
281
282 /* Allocate a pointer, generic chip and chiptypes for each chip */
283 sz = sizeof(*dgc) + numchips * sizeof(gc);
284 sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type));
285
286 tmp = dgc = kzalloc(sz, GFP_KERNEL);
287 if (!dgc)
288 return -ENOMEM;
289 dgc->irqs_per_chip = irqs_per_chip;
290 dgc->num_chips = numchips;
291 dgc->irq_flags_to_set = set;
292 dgc->irq_flags_to_clear = clr;
293 dgc->gc_flags = gcflags;
294 d->gc = dgc;
295
296 /* Calc pointer to the first generic chip */
297 tmp += sizeof(*dgc) + numchips * sizeof(gc);
298 for (i = 0; i < numchips; i++) {
299 /* Store the pointer to the generic chip */
300 dgc->gc[i] = gc = tmp;
301 irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
302 NULL, handler);
303 gc->domain = d;
304 raw_spin_lock_irqsave(&gc_lock, flags);
305 list_add_tail(&gc->list, &gc_list);
306 raw_spin_unlock_irqrestore(&gc_lock, flags);
307 /* Calc pointer to the next generic chip */
308 tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
309 }
310 d->name = name;
311 return 0;
312}
313EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
314
315/**
316 * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq
317 * @d: irq domain pointer
318 * @hw_irq: Hardware interrupt number
319 */
320struct irq_chip_generic *
321irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
322{
323 struct irq_domain_chip_generic *dgc = d->gc;
324 int idx;
325
326 if (!dgc)
327 return NULL;
328 idx = hw_irq / dgc->irqs_per_chip;
329 if (idx >= dgc->num_chips)
330 return NULL;
331 return dgc->gc[idx];
332}
333EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
334
217/* 335/*
218 * Separate lockdep class for interrupt chip which can nest irq_desc 336 * Separate lockdep class for interrupt chip which can nest irq_desc
219 * lock. 337 * lock.
220 */ 338 */
221static struct lock_class_key irq_nested_lock_class; 339static struct lock_class_key irq_nested_lock_class;
222 340
341/*
342 * irq_map_generic_chip - Map a generic chip for an irq domain
343 */
344static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
345 irq_hw_number_t hw_irq)
346{
347 struct irq_data *data = irq_get_irq_data(virq);
348 struct irq_domain_chip_generic *dgc = d->gc;
349 struct irq_chip_generic *gc;
350 struct irq_chip_type *ct;
351 struct irq_chip *chip;
352 unsigned long flags;
353 int idx;
354
355 if (!d->gc)
356 return -ENODEV;
357
358 idx = hw_irq / dgc->irqs_per_chip;
359 if (idx >= dgc->num_chips)
360 return -EINVAL;
361 gc = dgc->gc[idx];
362
363 idx = hw_irq % dgc->irqs_per_chip;
364
365 if (test_bit(idx, &gc->unused))
366 return -ENOTSUPP;
367
368 if (test_bit(idx, &gc->installed))
369 return -EBUSY;
370
371 ct = gc->chip_types;
372 chip = &ct->chip;
373
374 /* We only init the cache for the first mapping of a generic chip */
375 if (!gc->installed) {
376 raw_spin_lock_irqsave(&gc->lock, flags);
377 irq_gc_init_mask_cache(gc, dgc->gc_flags);
378 raw_spin_unlock_irqrestore(&gc->lock, flags);
379 }
380
381 /* Mark the interrupt as installed */
382 set_bit(idx, &gc->installed);
383
384 if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
385 irq_set_lockdep_class(virq, &irq_nested_lock_class);
386
387 if (chip->irq_calc_mask)
388 chip->irq_calc_mask(data);
389 else
390 data->mask = 1 << idx;
391
392 irq_set_chip_and_handler(virq, chip, ct->handler);
393 irq_set_chip_data(virq, gc);
394 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
395 return 0;
396}
397
398struct irq_domain_ops irq_generic_chip_ops = {
399 .map = irq_map_generic_chip,
400 .xlate = irq_domain_xlate_onetwocell,
401};
402EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
403
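A hedged driver-side sketch of the API added above: register a linear domain backed by irq_generic_chip_ops, allocate one generic chip per 32 hardware interrupts, then fill in register offsets and callbacks through irq_get_domain_generic_chip(). The device-tree node, register base, offsets and per-chip stride are assumptions for illustration:

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

#define DEMO_NR_IRQS	64
#define DEMO_REG_MASK	0x04
#define DEMO_REG_ACK	0x08

static int demo_intc_init(struct device_node *np, void __iomem *base)
{
	struct irq_domain *domain;
	struct irq_chip_generic *gc;
	int i, ret;

	domain = irq_domain_add_linear(np, DEMO_NR_IRQS,
				       &irq_generic_chip_ops, NULL);
	if (!domain)
		return -ENOMEM;

	ret = irq_alloc_domain_generic_chips(domain, 32, 1, "demo-intc",
					     handle_level_irq, 0, 0,
					     IRQ_GC_INIT_MASK_CACHE);
	if (ret)
		return ret;

	for (i = 0; i < DEMO_NR_IRQS; i += 32) {
		gc = irq_get_domain_generic_chip(domain, i);
		gc->reg_base = base + (i / 32) * 0x40;	/* assumed bank stride */
		gc->chip_types[0].regs.mask = DEMO_REG_MASK;
		gc->chip_types[0].regs.ack = DEMO_REG_ACK;
		gc->chip_types[0].chip.irq_mask = irq_gc_mask_set_bit;
		gc->chip_types[0].chip.irq_unmask = irq_gc_mask_clr_bit;
		gc->chip_types[0].chip.irq_ack = irq_gc_ack_set_bit;
	}
	return 0;
}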
223/** 404/**
224 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip 405 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip
225 * @gc: Generic irq chip holding all data 406 * @gc: Generic irq chip holding all data
@@ -237,15 +418,14 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
237 unsigned int set) 418 unsigned int set)
238{ 419{
239 struct irq_chip_type *ct = gc->chip_types; 420 struct irq_chip_type *ct = gc->chip_types;
421 struct irq_chip *chip = &ct->chip;
240 unsigned int i; 422 unsigned int i;
241 423
242 raw_spin_lock(&gc_lock); 424 raw_spin_lock(&gc_lock);
243 list_add_tail(&gc->list, &gc_list); 425 list_add_tail(&gc->list, &gc_list);
244 raw_spin_unlock(&gc_lock); 426 raw_spin_unlock(&gc_lock);
245 427
246 /* Init mask cache ? */ 428 irq_gc_init_mask_cache(gc, flags);
247 if (flags & IRQ_GC_INIT_MASK_CACHE)
248 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
249 429
250 for (i = gc->irq_base; msk; msk >>= 1, i++) { 430 for (i = gc->irq_base; msk; msk >>= 1, i++) {
251 if (!(msk & 0x01)) 431 if (!(msk & 0x01))
@@ -254,7 +434,15 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
254 if (flags & IRQ_GC_INIT_NESTED_LOCK) 434 if (flags & IRQ_GC_INIT_NESTED_LOCK)
255 irq_set_lockdep_class(i, &irq_nested_lock_class); 435 irq_set_lockdep_class(i, &irq_nested_lock_class);
256 436
257 irq_set_chip_and_handler(i, &ct->chip, ct->handler); 437 if (!(flags & IRQ_GC_NO_MASK)) {
438 struct irq_data *d = irq_get_irq_data(i);
439
440 if (chip->irq_calc_mask)
441 chip->irq_calc_mask(d);
442 else
443 d->mask = 1 << (i - gc->irq_base);
444 }
445 irq_set_chip_and_handler(i, chip, ct->handler);
258 irq_set_chip_data(i, gc); 446 irq_set_chip_data(i, gc);
259 irq_modify_status(i, clr, set); 447 irq_modify_status(i, clr, set);
260 } 448 }
@@ -265,7 +453,7 @@ EXPORT_SYMBOL_GPL(irq_setup_generic_chip);
265/** 453/**
266 * irq_setup_alt_chip - Switch to alternative chip 454 * irq_setup_alt_chip - Switch to alternative chip
267 * @d: irq_data for this interrupt 455 * @d: irq_data for this interrupt
268 * @type Flow type to be initialized 456 * @type: Flow type to be initialized
269 * 457 *
270 * Only to be called from chip->irq_set_type() callbacks. 458 * Only to be called from chip->irq_set_type() callbacks.
271 */ 459 */
@@ -317,6 +505,24 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
317} 505}
318EXPORT_SYMBOL_GPL(irq_remove_generic_chip); 506EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
319 507
508static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc)
509{
510 unsigned int virq;
511
512 if (!gc->domain)
513 return irq_get_irq_data(gc->irq_base);
514
515 /*
 516 * We don't know which of the irqs has actually been
517 * installed. Use the first one.
518 */
519 if (!gc->installed)
520 return NULL;
521
522 virq = irq_find_mapping(gc->domain, gc->irq_base + __ffs(gc->installed));
523 return virq ? irq_get_irq_data(virq) : NULL;
524}
525
320#ifdef CONFIG_PM 526#ifdef CONFIG_PM
321static int irq_gc_suspend(void) 527static int irq_gc_suspend(void)
322{ 528{
@@ -325,8 +531,12 @@ static int irq_gc_suspend(void)
325 list_for_each_entry(gc, &gc_list, list) { 531 list_for_each_entry(gc, &gc_list, list) {
326 struct irq_chip_type *ct = gc->chip_types; 532 struct irq_chip_type *ct = gc->chip_types;
327 533
328 if (ct->chip.irq_suspend) 534 if (ct->chip.irq_suspend) {
329 ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); 535 struct irq_data *data = irq_gc_get_irq_data(gc);
536
537 if (data)
538 ct->chip.irq_suspend(data);
539 }
330 } 540 }
331 return 0; 541 return 0;
332} 542}
@@ -338,8 +548,12 @@ static void irq_gc_resume(void)
338 list_for_each_entry(gc, &gc_list, list) { 548 list_for_each_entry(gc, &gc_list, list) {
339 struct irq_chip_type *ct = gc->chip_types; 549 struct irq_chip_type *ct = gc->chip_types;
340 550
341 if (ct->chip.irq_resume) 551 if (ct->chip.irq_resume) {
342 ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); 552 struct irq_data *data = irq_gc_get_irq_data(gc);
553
554 if (data)
555 ct->chip.irq_resume(data);
556 }
343 } 557 }
344} 558}
345#else 559#else
@@ -354,8 +568,12 @@ static void irq_gc_shutdown(void)
354 list_for_each_entry(gc, &gc_list, list) { 568 list_for_each_entry(gc, &gc_list, list) {
355 struct irq_chip_type *ct = gc->chip_types; 569 struct irq_chip_type *ct = gc->chip_types;
356 570
357 if (ct->chip.irq_pm_shutdown) 571 if (ct->chip.irq_pm_shutdown) {
358 ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); 572 struct irq_data *data = irq_gc_get_irq_data(gc);
573
574 if (data)
575 ct->chip.irq_pm_shutdown(data);
576 }
359 } 577 }
360} 578}
361 579
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 54a4d5223238..706724e9835d 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -16,12 +16,6 @@
16#include <linux/smp.h> 16#include <linux/smp.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18 18
19#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs.
20 * ie. legacy 8259, gets irqs 1..15 */
21#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */
22#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */
23#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */
24
25static LIST_HEAD(irq_domain_list); 19static LIST_HEAD(irq_domain_list);
26static DEFINE_MUTEX(irq_domain_mutex); 20static DEFINE_MUTEX(irq_domain_mutex);
27 21
@@ -29,9 +23,11 @@ static DEFINE_MUTEX(revmap_trees_mutex);
29static struct irq_domain *irq_default_domain; 23static struct irq_domain *irq_default_domain;
30 24
31/** 25/**
32 * irq_domain_alloc() - Allocate a new irq_domain data structure 26 * __irq_domain_add() - Allocate a new irq_domain data structure
33 * @of_node: optional device-tree node of the interrupt controller 27 * @of_node: optional device-tree node of the interrupt controller
34 * @revmap_type: type of reverse mapping to use 28 * @size: Size of linear map; 0 for radix mapping only
29 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
30 * direct mapping
35 * @ops: map/unmap domain callbacks 31 * @ops: map/unmap domain callbacks
36 * @host_data: Controller private data pointer 32 * @host_data: Controller private data pointer
37 * 33 *
@@ -39,41 +35,35 @@ static struct irq_domain *irq_default_domain;
39 * register allocated irq_domain with irq_domain_register(). Returns pointer 35 * register allocated irq_domain with irq_domain_register(). Returns pointer
40 * to IRQ domain, or NULL on failure. 36 * to IRQ domain, or NULL on failure.
41 */ 37 */
42static struct irq_domain *irq_domain_alloc(struct device_node *of_node, 38struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
43 unsigned int revmap_type, 39 irq_hw_number_t hwirq_max, int direct_max,
44 const struct irq_domain_ops *ops, 40 const struct irq_domain_ops *ops,
45 void *host_data) 41 void *host_data)
46{ 42{
47 struct irq_domain *domain; 43 struct irq_domain *domain;
48 44
49 domain = kzalloc_node(sizeof(*domain), GFP_KERNEL, 45 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
50 of_node_to_nid(of_node)); 46 GFP_KERNEL, of_node_to_nid(of_node));
51 if (WARN_ON(!domain)) 47 if (WARN_ON(!domain))
52 return NULL; 48 return NULL;
53 49
54 /* Fill structure */ 50 /* Fill structure */
55 domain->revmap_type = revmap_type; 51 INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
56 domain->ops = ops; 52 domain->ops = ops;
57 domain->host_data = host_data; 53 domain->host_data = host_data;
58 domain->of_node = of_node_get(of_node); 54 domain->of_node = of_node_get(of_node);
55 domain->hwirq_max = hwirq_max;
56 domain->revmap_size = size;
57 domain->revmap_direct_max_irq = direct_max;
59 58
60 return domain;
61}
62
63static void irq_domain_free(struct irq_domain *domain)
64{
65 of_node_put(domain->of_node);
66 kfree(domain);
67}
68
69static void irq_domain_add(struct irq_domain *domain)
70{
71 mutex_lock(&irq_domain_mutex); 59 mutex_lock(&irq_domain_mutex);
72 list_add(&domain->link, &irq_domain_list); 60 list_add(&domain->link, &irq_domain_list);
73 mutex_unlock(&irq_domain_mutex); 61 mutex_unlock(&irq_domain_mutex);
74 pr_debug("Allocated domain of type %d @0x%p\n", 62
75 domain->revmap_type, domain); 63 pr_debug("Added domain %s\n", domain->name);
64 return domain;
76} 65}
66EXPORT_SYMBOL_GPL(__irq_domain_add);
77 67
78/** 68/**
79 * irq_domain_remove() - Remove an irq domain. 69 * irq_domain_remove() - Remove an irq domain.
@@ -87,29 +77,12 @@ void irq_domain_remove(struct irq_domain *domain)
87{ 77{
88 mutex_lock(&irq_domain_mutex); 78 mutex_lock(&irq_domain_mutex);
89 79
90 switch (domain->revmap_type) { 80 /*
91 case IRQ_DOMAIN_MAP_LEGACY: 81 * radix_tree_delete() takes care of destroying the root
92 /* 82 * node when all entries are removed. Shout if there are
93 * Legacy domains don't manage their own irq_desc 83 * any mappings left.
94 * allocations, we expect the caller to handle irq_desc 84 */
95 * freeing on their own. 85 WARN_ON(domain->revmap_tree.height);
96 */
97 break;
98 case IRQ_DOMAIN_MAP_TREE:
99 /*
100 * radix_tree_delete() takes care of destroying the root
101 * node when all entries are removed. Shout if there are
102 * any mappings left.
103 */
104 WARN_ON(domain->revmap_data.tree.height);
105 break;
106 case IRQ_DOMAIN_MAP_LINEAR:
107 kfree(domain->revmap_data.linear.revmap);
108 domain->revmap_data.linear.size = 0;
109 break;
110 case IRQ_DOMAIN_MAP_NOMAP:
111 break;
112 }
113 86
114 list_del(&domain->link); 87 list_del(&domain->link);
115 88
@@ -121,44 +94,30 @@ void irq_domain_remove(struct irq_domain *domain)
121 94
122 mutex_unlock(&irq_domain_mutex); 95 mutex_unlock(&irq_domain_mutex);
123 96
124 pr_debug("Removed domain of type %d @0x%p\n", 97 pr_debug("Removed domain %s\n", domain->name);
125 domain->revmap_type, domain);
126 98
127 irq_domain_free(domain); 99 of_node_put(domain->of_node);
100 kfree(domain);
128} 101}
129EXPORT_SYMBOL_GPL(irq_domain_remove); 102EXPORT_SYMBOL_GPL(irq_domain_remove);
130 103
131static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
132 irq_hw_number_t hwirq)
133{
134 irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq;
135 int size = domain->revmap_data.legacy.size;
136
137 if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size))
138 return 0;
139 return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq;
140}
141
142/** 104/**
143 * irq_domain_add_simple() - Allocate and register a simple irq_domain. 105 * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
144 * @of_node: pointer to interrupt controller's device tree node. 106 * @of_node: pointer to interrupt controller's device tree node.
145 * @size: total number of irqs in mapping 107 * @size: total number of irqs in mapping
146 * @first_irq: first number of irq block assigned to the domain, 108 * @first_irq: first number of irq block assigned to the domain,
147 * pass zero to assign irqs on-the-fly. This will result in a 109 * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
148 * linear IRQ domain so it is important to use irq_create_mapping() 110 * pre-map all of the irqs in the domain to virqs starting at first_irq.
149 * for each used IRQ, especially when SPARSE_IRQ is enabled.
150 * @ops: map/unmap domain callbacks 111 * @ops: map/unmap domain callbacks
151 * @host_data: Controller private data pointer 112 * @host_data: Controller private data pointer
152 * 113 *
153 * Allocates a legacy irq_domain if irq_base is positive or a linear 114 * Allocates an irq_domain, and optionally if first_irq is positive then also
154 * domain otherwise. For the legacy domain, IRQ descriptors will also 115 * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq.
155 * be allocated.
156 * 116 *
157 * This is intended to implement the expected behaviour for most 117 * This is intended to implement the expected behaviour for most
158 * interrupt controllers which is that a linear mapping should 118 * interrupt controllers. If device tree is used, then first_irq will be 0 and
159 * normally be used unless the system requires a legacy mapping in 119 * irqs get mapped dynamically on the fly. However, if the controller requires
160 * order to support supplying interrupt numbers during non-DT 120 * static virq assignments (non-DT boot) then it will set that up correctly.
161 * registration of devices.
162 */ 121 */
163struct irq_domain *irq_domain_add_simple(struct device_node *of_node, 122struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
164 unsigned int size, 123 unsigned int size,
@@ -166,33 +125,25 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
166 const struct irq_domain_ops *ops, 125 const struct irq_domain_ops *ops,
167 void *host_data) 126 void *host_data)
168{ 127{
169 if (first_irq > 0) { 128 struct irq_domain *domain;
170 int irq_base;
171 129
130 domain = __irq_domain_add(of_node, size, size, 0, ops, host_data);
131 if (!domain)
132 return NULL;
133
134 if (first_irq > 0) {
172 if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { 135 if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
173 /* 136 /* attempt to allocated irq_descs */
174 * Set the descriptor allocator to search for a 137 int rc = irq_alloc_descs(first_irq, first_irq, size,
175 * 1-to-1 mapping, such as irq_alloc_desc_at(). 138 of_node_to_nid(of_node));
176 * Use of_node_to_nid() which is defined to 139 if (rc < 0)
177 * numa_node_id() on platforms that have no custom
178 * implementation.
179 */
180 irq_base = irq_alloc_descs(first_irq, first_irq, size,
181 of_node_to_nid(of_node));
182 if (irq_base < 0) {
183 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", 140 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
184 first_irq); 141 first_irq);
185 irq_base = first_irq; 142 }
186 } 143 irq_domain_associate_many(domain, first_irq, 0, size);
187 } else
188 irq_base = first_irq;
189
190 return irq_domain_add_legacy(of_node, size, irq_base, 0,
191 ops, host_data);
192 } 144 }
193 145
194 /* A linear domain is the default */ 146 return domain;
195 return irq_domain_add_linear(of_node, size, ops, host_data);
196} 147}
197EXPORT_SYMBOL_GPL(irq_domain_add_simple); 148EXPORT_SYMBOL_GPL(irq_domain_add_simple);
198 149
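A hedged sketch of the typical caller of irq_domain_add_simple(): a device-tree driver passes first_irq == 0 so descriptors are allocated on demand, supplies a ->map() callback to wire up each virq, and maps individual hwirqs lazily with irq_create_mapping(). The chip, ops and probe names are placeholders:

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static struct irq_chip demo_chip;	/* assumed to be filled in elsewhere */

static int demo_domain_map(struct irq_domain *d, unsigned int virq,
			   irq_hw_number_t hwirq)
{
	irq_set_chip_and_handler(virq, &demo_chip, handle_level_irq);
	irq_set_chip_data(virq, d->host_data);
	return 0;
}

static const struct irq_domain_ops demo_domain_ops = {
	.map	= demo_domain_map,
	.xlate	= irq_domain_xlate_onecell,
};

static struct irq_domain *demo_probe(struct device_node *np, void *priv)
{
	/* 32 hwirqs, dynamic virqs; hwirq 5 is mapped lazily as an example */
	struct irq_domain *d = irq_domain_add_simple(np, 32, 0,
						     &demo_domain_ops, priv);
	if (d)
		irq_create_mapping(d, 5);
	return d;
}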
@@ -219,131 +170,19 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
219 void *host_data) 170 void *host_data)
220{ 171{
221 struct irq_domain *domain; 172 struct irq_domain *domain;
222 unsigned int i;
223 173
224 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); 174 domain = __irq_domain_add(of_node, first_hwirq + size,
175 first_hwirq + size, 0, ops, host_data);
225 if (!domain) 176 if (!domain)
226 return NULL; 177 return NULL;
227 178
228 domain->revmap_data.legacy.first_irq = first_irq; 179 irq_domain_associate_many(domain, first_irq, first_hwirq, size);
229 domain->revmap_data.legacy.first_hwirq = first_hwirq;
230 domain->revmap_data.legacy.size = size;
231
232 mutex_lock(&irq_domain_mutex);
233 /* Verify that all the irqs are available */
234 for (i = 0; i < size; i++) {
235 int irq = first_irq + i;
236 struct irq_data *irq_data = irq_get_irq_data(irq);
237
238 if (WARN_ON(!irq_data || irq_data->domain)) {
239 mutex_unlock(&irq_domain_mutex);
240 irq_domain_free(domain);
241 return NULL;
242 }
243 }
244
245 /* Claim all of the irqs before registering a legacy domain */
246 for (i = 0; i < size; i++) {
247 struct irq_data *irq_data = irq_get_irq_data(first_irq + i);
248 irq_data->hwirq = first_hwirq + i;
249 irq_data->domain = domain;
250 }
251 mutex_unlock(&irq_domain_mutex);
252
253 for (i = 0; i < size; i++) {
254 int irq = first_irq + i;
255 int hwirq = first_hwirq + i;
256
257 /* IRQ0 gets ignored */
258 if (!irq)
259 continue;
260 180
261 /* Legacy flags are left to default at this point,
262 * one can then use irq_create_mapping() to
263 * explicitly change them
264 */
265 if (ops->map)
266 ops->map(domain, irq, hwirq);
267
268 /* Clear norequest flags */
269 irq_clear_status_flags(irq, IRQ_NOREQUEST);
270 }
271
272 irq_domain_add(domain);
273 return domain; 181 return domain;
274} 182}
275EXPORT_SYMBOL_GPL(irq_domain_add_legacy); 183EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
276 184
277/** 185/**
278 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
279 * @of_node: pointer to interrupt controller's device tree node.
280 * @size: Number of interrupts in the domain.
281 * @ops: map/unmap domain callbacks
282 * @host_data: Controller private data pointer
283 */
284struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
285 unsigned int size,
286 const struct irq_domain_ops *ops,
287 void *host_data)
288{
289 struct irq_domain *domain;
290 unsigned int *revmap;
291
292 revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL,
293 of_node_to_nid(of_node));
294 if (WARN_ON(!revmap))
295 return NULL;
296
297 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data);
298 if (!domain) {
299 kfree(revmap);
300 return NULL;
301 }
302 domain->revmap_data.linear.size = size;
303 domain->revmap_data.linear.revmap = revmap;
304 irq_domain_add(domain);
305 return domain;
306}
307EXPORT_SYMBOL_GPL(irq_domain_add_linear);
308
309struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
310 unsigned int max_irq,
311 const struct irq_domain_ops *ops,
312 void *host_data)
313{
314 struct irq_domain *domain = irq_domain_alloc(of_node,
315 IRQ_DOMAIN_MAP_NOMAP, ops, host_data);
316 if (domain) {
317 domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0;
318 irq_domain_add(domain);
319 }
320 return domain;
321}
322EXPORT_SYMBOL_GPL(irq_domain_add_nomap);
323
324/**
325 * irq_domain_add_tree()
326 * @of_node: pointer to interrupt controller's device tree node.
327 * @ops: map/unmap domain callbacks
328 *
329 * Note: The radix tree will be allocated later during boot automatically
330 * (the reverse mapping will use the slow path until that happens).
331 */
332struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
333 const struct irq_domain_ops *ops,
334 void *host_data)
335{
336 struct irq_domain *domain = irq_domain_alloc(of_node,
337 IRQ_DOMAIN_MAP_TREE, ops, host_data);
338 if (domain) {
339 INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL);
340 irq_domain_add(domain);
341 }
342 return domain;
343}
344EXPORT_SYMBOL_GPL(irq_domain_add_tree);
345
346/**
347 * irq_find_host() - Locates a domain for a given device node 186 * irq_find_host() - Locates a domain for a given device node
348 * @node: device-tree node of the interrupt controller 187 * @node: device-tree node of the interrupt controller
349 */ 188 */
@@ -391,125 +230,108 @@ void irq_set_default_host(struct irq_domain *domain)
391} 230}
392EXPORT_SYMBOL_GPL(irq_set_default_host); 231EXPORT_SYMBOL_GPL(irq_set_default_host);
393 232
394static void irq_domain_disassociate_many(struct irq_domain *domain, 233static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
395 unsigned int irq_base, int count)
396{ 234{
397 /* 235 struct irq_data *irq_data = irq_get_irq_data(irq);
398 * disassociate in reverse order; 236 irq_hw_number_t hwirq;
399 * not strictly necessary, but nice for unwinding
400 */
401 while (count--) {
402 int irq = irq_base + count;
403 struct irq_data *irq_data = irq_get_irq_data(irq);
404 irq_hw_number_t hwirq;
405 237
406 if (WARN_ON(!irq_data || irq_data->domain != domain)) 238 if (WARN(!irq_data || irq_data->domain != domain,
407 continue; 239 "virq%i doesn't exist; cannot disassociate\n", irq))
240 return;
408 241
409 hwirq = irq_data->hwirq; 242 hwirq = irq_data->hwirq;
410 irq_set_status_flags(irq, IRQ_NOREQUEST); 243 irq_set_status_flags(irq, IRQ_NOREQUEST);
411 244
412 /* remove chip and handler */ 245 /* remove chip and handler */
413 irq_set_chip_and_handler(irq, NULL, NULL); 246 irq_set_chip_and_handler(irq, NULL, NULL);
414 247
415 /* Make sure it's completed */ 248 /* Make sure it's completed */
416 synchronize_irq(irq); 249 synchronize_irq(irq);
417 250
418 /* Tell the PIC about it */ 251 /* Tell the PIC about it */
419 if (domain->ops->unmap) 252 if (domain->ops->unmap)
420 domain->ops->unmap(domain, irq); 253 domain->ops->unmap(domain, irq);
421 smp_mb(); 254 smp_mb();
422 255
423 irq_data->domain = NULL; 256 irq_data->domain = NULL;
424 irq_data->hwirq = 0; 257 irq_data->hwirq = 0;
425 258
426 /* Clear reverse map */ 259 /* Clear reverse map for this hwirq */
427 switch(domain->revmap_type) { 260 if (hwirq < domain->revmap_size) {
428 case IRQ_DOMAIN_MAP_LINEAR: 261 domain->linear_revmap[hwirq] = 0;
429 if (hwirq < domain->revmap_data.linear.size) 262 } else {
430 domain->revmap_data.linear.revmap[hwirq] = 0; 263 mutex_lock(&revmap_trees_mutex);
431 break; 264 radix_tree_delete(&domain->revmap_tree, hwirq);
432 case IRQ_DOMAIN_MAP_TREE: 265 mutex_unlock(&revmap_trees_mutex);
433 mutex_lock(&revmap_trees_mutex);
434 radix_tree_delete(&domain->revmap_data.tree, hwirq);
435 mutex_unlock(&revmap_trees_mutex);
436 break;
437 }
438 } 266 }
439} 267}
440 268
441int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, 269int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
442 irq_hw_number_t hwirq_base, int count) 270 irq_hw_number_t hwirq)
443{ 271{
444 unsigned int virq = irq_base; 272 struct irq_data *irq_data = irq_get_irq_data(virq);
445 irq_hw_number_t hwirq = hwirq_base; 273 int ret;
446 int i, ret;
447 274
448 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, 275 if (WARN(hwirq >= domain->hwirq_max,
449 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); 276 "error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name))
277 return -EINVAL;
278 if (WARN(!irq_data, "error: virq%i is not allocated", virq))
279 return -EINVAL;
280 if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
281 return -EINVAL;
450 282
451 for (i = 0; i < count; i++) { 283 mutex_lock(&irq_domain_mutex);
452 struct irq_data *irq_data = irq_get_irq_data(virq + i); 284 irq_data->hwirq = hwirq;
453 285 irq_data->domain = domain;
454 if (WARN(!irq_data, "error: irq_desc not allocated; " 286 if (domain->ops->map) {
455 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) 287 ret = domain->ops->map(domain, virq, hwirq);
456 return -EINVAL; 288 if (ret != 0) {
457 if (WARN(irq_data->domain, "error: irq_desc already associated; " 289 /*
458 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) 290 * If map() returns -EPERM, this interrupt is protected
459 return -EINVAL; 291 * by the firmware or some other service and shall not
460 }; 292 * be mapped. Don't bother telling the user about it.
461 293 */
462 for (i = 0; i < count; i++, virq++, hwirq++) { 294 if (ret != -EPERM) {
463 struct irq_data *irq_data = irq_get_irq_data(virq); 295 pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n",
464 296 domain->name, hwirq, virq, ret);
465 irq_data->hwirq = hwirq;
466 irq_data->domain = domain;
467 if (domain->ops->map) {
468 ret = domain->ops->map(domain, virq, hwirq);
469 if (ret != 0) {
470 /*
471 * If map() returns -EPERM, this interrupt is protected
472 * by the firmware or some other service and shall not
473 * be mapped.
474 *
475 * Since on some platforms we blindly try to map everything
476 * we end up with a log full of backtraces.
477 *
478 * So instead, we silently fail on -EPERM, it is the
479 * responsibility of the PIC driver to display a relevant
480 * message if needed.
481 */
482 if (ret != -EPERM) {
483 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
484 virq, hwirq, ret);
485 WARN_ON(1);
486 }
487 irq_data->domain = NULL;
488 irq_data->hwirq = 0;
489 goto err_unmap;
490 } 297 }
298 irq_data->domain = NULL;
299 irq_data->hwirq = 0;
300 mutex_unlock(&irq_domain_mutex);
301 return ret;
491 } 302 }
492 303
493 switch (domain->revmap_type) { 304 /* If not already assigned, give the domain the chip's name */
494 case IRQ_DOMAIN_MAP_LINEAR: 305 if (!domain->name && irq_data->chip)
495 if (hwirq < domain->revmap_data.linear.size) 306 domain->name = irq_data->chip->name;
496 domain->revmap_data.linear.revmap[hwirq] = virq; 307 }
497 break;
498 case IRQ_DOMAIN_MAP_TREE:
499 mutex_lock(&revmap_trees_mutex);
500 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
501 mutex_unlock(&revmap_trees_mutex);
502 break;
503 }
504 308
505 irq_clear_status_flags(virq, IRQ_NOREQUEST); 309 if (hwirq < domain->revmap_size) {
310 domain->linear_revmap[hwirq] = virq;
311 } else {
312 mutex_lock(&revmap_trees_mutex);
313 radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
314 mutex_unlock(&revmap_trees_mutex);
506 } 315 }
316 mutex_unlock(&irq_domain_mutex);
317
318 irq_clear_status_flags(virq, IRQ_NOREQUEST);
507 319
508 return 0; 320 return 0;
321}
322EXPORT_SYMBOL_GPL(irq_domain_associate);
509 323
510 err_unmap: 324void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
511 irq_domain_disassociate_many(domain, irq_base, i); 325 irq_hw_number_t hwirq_base, int count)
512 return -EINVAL; 326{
327 int i;
328
329 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
330 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
331
332 for (i = 0; i < count; i++) {
333 irq_domain_associate(domain, irq_base + i, hwirq_base + i);
334 }
513} 335}
514EXPORT_SYMBOL_GPL(irq_domain_associate_many); 336EXPORT_SYMBOL_GPL(irq_domain_associate_many);
515 337
@@ -519,7 +341,9 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many);
519 * 341 *
520 * This routine is used for irq controllers which can choose the hardware 342 * This routine is used for irq controllers which can choose the hardware
521 * interrupt numbers they generate. In such a case it's simplest to use 343 * interrupt numbers they generate. In such a case it's simplest to use
522 * the linux irq as the hardware interrupt number. 344 * the linux irq as the hardware interrupt number. It still uses the linear
345 * or radix tree to store the mapping, but the irq controller can optimize
346 * the revmap path by using the hwirq directly.
523 */ 347 */
524unsigned int irq_create_direct_mapping(struct irq_domain *domain) 348unsigned int irq_create_direct_mapping(struct irq_domain *domain)
525{ 349{
@@ -528,17 +352,14 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
528 if (domain == NULL) 352 if (domain == NULL)
529 domain = irq_default_domain; 353 domain = irq_default_domain;
530 354
531 if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP))
532 return 0;
533
534 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); 355 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
535 if (!virq) { 356 if (!virq) {
536 pr_debug("create_direct virq allocation failed\n"); 357 pr_debug("create_direct virq allocation failed\n");
537 return 0; 358 return 0;
538 } 359 }
539 if (virq >= domain->revmap_data.nomap.max_irq) { 360 if (virq >= domain->revmap_direct_max_irq) {
540 pr_err("ERROR: no free irqs available below %i maximum\n", 361 pr_err("ERROR: no free irqs available below %i maximum\n",
541 domain->revmap_data.nomap.max_irq); 362 domain->revmap_direct_max_irq);
542 irq_free_desc(virq); 363 irq_free_desc(virq);
543 return 0; 364 return 0;
544 } 365 }
@@ -575,9 +396,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
575 if (domain == NULL) 396 if (domain == NULL)
576 domain = irq_default_domain; 397 domain = irq_default_domain;
577 if (domain == NULL) { 398 if (domain == NULL) {
578 pr_warning("irq_create_mapping called for" 399 WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
579 " NULL domain, hwirq=%lx\n", hwirq);
580 WARN_ON(1);
581 return 0; 400 return 0;
582 } 401 }
583 pr_debug("-> using domain @%p\n", domain); 402 pr_debug("-> using domain @%p\n", domain);
@@ -589,10 +408,6 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
589 return virq; 408 return virq;
590 } 409 }
591 410
592 /* Get a virtual interrupt number */
593 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
594 return irq_domain_legacy_revmap(domain, hwirq);
595
596 /* Allocate a virtual interrupt number */ 411 /* Allocate a virtual interrupt number */
597 hint = hwirq % nr_irqs; 412 hint = hwirq % nr_irqs;
598 if (hint == 0) 413 if (hint == 0)
@@ -645,12 +460,7 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
645 if (unlikely(ret < 0)) 460 if (unlikely(ret < 0))
646 return ret; 461 return ret;
647 462
648 ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count); 463 irq_domain_associate_many(domain, irq_base, hwirq_base, count);
649 if (unlikely(ret < 0)) {
650 irq_free_descs(irq_base, count);
651 return ret;
652 }
653
654 return 0; 464 return 0;
655} 465}
656EXPORT_SYMBOL_GPL(irq_create_strict_mappings); 466EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
@@ -665,20 +475,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
665 475
666 domain = controller ? irq_find_host(controller) : irq_default_domain; 476 domain = controller ? irq_find_host(controller) : irq_default_domain;
667 if (!domain) { 477 if (!domain) {
668#ifdef CONFIG_MIPS 478 pr_warn("no irq domain found for %s !\n",
669 /* 479 of_node_full_name(controller));
670 * Workaround to avoid breaking interrupt controller drivers
671 * that don't yet register an irq_domain. This is temporary
672 * code. ~~~gcl, Feb 24, 2012
673 *
674 * Scheduled for removal in Linux v3.6. That should be enough
675 * time.
676 */
677 if (intsize > 0)
678 return intspec[0];
679#endif
680 pr_warning("no irq domain found for %s !\n",
681 of_node_full_name(controller));
682 return 0; 480 return 0;
683 } 481 }
684 482
@@ -698,7 +496,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
698 496
699 /* Set type if specified and different than the current one */ 497 /* Set type if specified and different than the current one */
700 if (type != IRQ_TYPE_NONE && 498 if (type != IRQ_TYPE_NONE &&
701 type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) 499 type != irq_get_trigger_type(virq))
702 irq_set_irq_type(virq, type); 500 irq_set_irq_type(virq, type);
703 return virq; 501 return virq;
704} 502}
@@ -720,11 +518,7 @@ void irq_dispose_mapping(unsigned int virq)
720 if (WARN_ON(domain == NULL)) 518 if (WARN_ON(domain == NULL))
721 return; 519 return;
722 520
723 /* Never unmap legacy interrupts */ 521 irq_domain_disassociate(domain, virq);
724 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
725 return;
726
727 irq_domain_disassociate_many(domain, virq, 1);
728 irq_free_desc(virq); 522 irq_free_desc(virq);
729} 523}
730EXPORT_SYMBOL_GPL(irq_dispose_mapping); 524EXPORT_SYMBOL_GPL(irq_dispose_mapping);
@@ -745,63 +539,51 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
745 if (domain == NULL) 539 if (domain == NULL)
746 return 0; 540 return 0;
747 541
748 switch (domain->revmap_type) { 542 if (hwirq < domain->revmap_direct_max_irq) {
749 case IRQ_DOMAIN_MAP_LEGACY:
750 return irq_domain_legacy_revmap(domain, hwirq);
751 case IRQ_DOMAIN_MAP_LINEAR:
752 return irq_linear_revmap(domain, hwirq);
753 case IRQ_DOMAIN_MAP_TREE:
754 rcu_read_lock();
755 data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
756 rcu_read_unlock();
757 if (data)
758 return data->irq;
759 break;
760 case IRQ_DOMAIN_MAP_NOMAP:
761 data = irq_get_irq_data(hwirq); 543 data = irq_get_irq_data(hwirq);
762 if (data && (data->domain == domain) && (data->hwirq == hwirq)) 544 if (data && (data->domain == domain) && (data->hwirq == hwirq))
763 return hwirq; 545 return hwirq;
764 break;
765 } 546 }
766 547
767 return 0; 548 /* Check if the hwirq is in the linear revmap. */
768} 549 if (hwirq < domain->revmap_size)
769EXPORT_SYMBOL_GPL(irq_find_mapping); 550 return domain->linear_revmap[hwirq];
770
771/**
772 * irq_linear_revmap() - Find a linux irq from a hw irq number.
773 * @domain: domain owning this hardware interrupt
774 * @hwirq: hardware irq number in that domain space
775 *
776 * This is a fast path that can be called directly by irq controller code to
777 * save a handful of instructions.
778 */
779unsigned int irq_linear_revmap(struct irq_domain *domain,
780 irq_hw_number_t hwirq)
781{
782 BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR);
783
784 /* Check revmap bounds; complain if exceeded */
785 if (WARN_ON(hwirq >= domain->revmap_data.linear.size))
786 return 0;
787 551
788 return domain->revmap_data.linear.revmap[hwirq]; 552 rcu_read_lock();
553 data = radix_tree_lookup(&domain->revmap_tree, hwirq);
554 rcu_read_unlock();
555 return data ? data->irq : 0;
789} 556}
790EXPORT_SYMBOL_GPL(irq_linear_revmap); 557EXPORT_SYMBOL_GPL(irq_find_mapping);
791 558
792#ifdef CONFIG_IRQ_DOMAIN_DEBUG 559#ifdef CONFIG_IRQ_DOMAIN_DEBUG
793static int virq_debug_show(struct seq_file *m, void *private) 560static int virq_debug_show(struct seq_file *m, void *private)
794{ 561{
795 unsigned long flags; 562 unsigned long flags;
796 struct irq_desc *desc; 563 struct irq_desc *desc;
797 const char *p; 564 struct irq_domain *domain;
798 static const char none[] = "none"; 565 struct radix_tree_iter iter;
799 void *data; 566 void *data, **slot;
800 int i; 567 int i;
801 568
802 seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", 569 seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
570 "name", "mapped", "linear-max", "direct-max", "devtree-node");
571 mutex_lock(&irq_domain_mutex);
572 list_for_each_entry(domain, &irq_domain_list, link) {
573 int count = 0;
574 radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
575 count++;
576 seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
577 domain == irq_default_domain ? '*' : ' ', domain->name,
578 domain->revmap_size + count, domain->revmap_size,
579 domain->revmap_direct_max_irq,
580 domain->of_node ? of_node_full_name(domain->of_node) : "");
581 }
582 mutex_unlock(&irq_domain_mutex);
583
584 seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq",
803 "chip name", (int)(2 * sizeof(void *) + 2), "chip data", 585 "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
804 "domain name"); 586 "active", "type", "domain");
805 587
806 for (i = 1; i < nr_irqs; i++) { 588 for (i = 1; i < nr_irqs; i++) {
807 desc = irq_to_desc(i); 589 desc = irq_to_desc(i);
@@ -809,28 +591,28 @@ static int virq_debug_show(struct seq_file *m, void *private)
809 continue; 591 continue;
810 592
811 raw_spin_lock_irqsave(&desc->lock, flags); 593 raw_spin_lock_irqsave(&desc->lock, flags);
594 domain = desc->irq_data.domain;
812 595
813 if (desc->action && desc->action->handler) { 596 if (domain) {
814 struct irq_chip *chip; 597 struct irq_chip *chip;
598 int hwirq = desc->irq_data.hwirq;
599 bool direct;
815 600
816 seq_printf(m, "%5d ", i); 601 seq_printf(m, "%5d ", i);
817 seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); 602 seq_printf(m, "0x%05x ", hwirq);
818 603
819 chip = irq_desc_get_chip(desc); 604 chip = irq_desc_get_chip(desc);
820 if (chip && chip->name) 605 seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
821 p = chip->name;
822 else
823 p = none;
824 seq_printf(m, "%-15s ", p);
825 606
826 data = irq_desc_get_chip_data(desc); 607 data = irq_desc_get_chip_data(desc);
827 seq_printf(m, data ? "0x%p " : " %p ", data); 608 seq_printf(m, data ? "0x%p " : " %p ", data);
828 609
829 if (desc->irq_data.domain) 610 seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
830 p = of_node_full_name(desc->irq_data.domain->of_node); 611 direct = (i == hwirq) && (i < domain->revmap_direct_max_irq);
831 else 612 seq_printf(m, "%6s%-8s ",
832 p = none; 613 (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
833 seq_printf(m, "%s\n", p); 614 direct ? "(DIRECT)" : "");
615 seq_printf(m, "%s\n", desc->irq_data.domain->name);
834 } 616 }
835 617
836 raw_spin_unlock_irqrestore(&desc->lock, flags); 618 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -927,18 +709,3 @@ const struct irq_domain_ops irq_domain_simple_ops = {
927 .xlate = irq_domain_xlate_onetwocell, 709 .xlate = irq_domain_xlate_onetwocell,
928}; 710};
929EXPORT_SYMBOL_GPL(irq_domain_simple_ops); 711EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
930
931#ifdef CONFIG_OF_IRQ
932void irq_domain_generate_simple(const struct of_device_id *match,
933 u64 phys_base, unsigned int irq_start)
934{
935 struct device_node *node;
936 pr_debug("looking for phys_base=%llx, irq_start=%i\n",
937 (unsigned long long) phys_base, (int) irq_start);
938 node = of_find_matching_node_by_address(NULL, match, phys_base);
939 if (node)
940 irq_domain_add_legacy(node, 32, irq_start, 0,
941 &irq_domain_simple_ops, NULL);
942}
943EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
944#endif
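
The consolidated irq_find_mapping() above replaces the old switch on revmap_type with a single path: an identity check for direct-mapped (nomap) domains, a linear array for hwirqs below revmap_size, and the radix tree for sparse hwirqs. The userspace sketch below models just the linear-plus-sparse part; struct demo_domain, the linked-list stand-in for the radix tree and the sample numbers are illustrative, not kernel API.

/*
 * Userspace sketch of the hybrid reverse map that irq_find_mapping()
 * now implements: a flat array for small hwirq numbers and a sparse
 * lookup (here a trivial linked list standing in for the radix tree)
 * for everything else.
 */
#include <stdio.h>

struct sparse_entry {
	unsigned long hwirq;
	unsigned int virq;
	struct sparse_entry *next;
};

struct demo_domain {
	unsigned int revmap_size;	/* size of the linear array */
	unsigned int *linear_revmap;	/* hwirq -> virq for small hwirqs */
	struct sparse_entry *tree;	/* fallback for large hwirqs */
};

static unsigned int demo_find_mapping(struct demo_domain *d, unsigned long hwirq)
{
	struct sparse_entry *e;

	/* Fast path: small hwirqs live in the linear array. */
	if (hwirq < d->revmap_size)
		return d->linear_revmap[hwirq];

	/* Slow path: walk the sparse map (radix tree in the kernel). */
	for (e = d->tree; e; e = e->next)
		if (e->hwirq == hwirq)
			return e->virq;
	return 0;	/* no mapping */
}

int main(void)
{
	unsigned int linear[4] = { 0, 16, 17, 18 };
	struct sparse_entry big = { .hwirq = 1000, .virq = 99, .next = NULL };
	struct demo_domain d = { .revmap_size = 4, .linear_revmap = linear, .tree = &big };

	printf("hwirq 2    -> virq %u\n", demo_find_mapping(&d, 2));
	printf("hwirq 1000 -> virq %u\n", demo_find_mapping(&d, 1000));
	printf("hwirq 7    -> virq %u\n", demo_find_mapping(&d, 7));
	return 0;
}

Keeping the common, densely numbered hwirqs in a flat array makes the hot lookup a single bounds check and load, while the radix tree only pays its cost for the rare large hwirq values.
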
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index fa17855ca65a..514bcfd855a8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -555,9 +555,9 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
555 return 0; 555 return 0;
556 556
557 if (irq_settings_can_request(desc)) { 557 if (irq_settings_can_request(desc)) {
558 if (desc->action) 558 if (!desc->action ||
559 if (irqflags & desc->action->flags & IRQF_SHARED) 559 irqflags & desc->action->flags & IRQF_SHARED)
560 canrequest =1; 560 canrequest = 1;
561 } 561 }
562 irq_put_desc_unlock(desc, flags); 562 irq_put_desc_unlock(desc, flags);
563 return canrequest; 563 return canrequest;
@@ -840,9 +840,6 @@ static void irq_thread_dtor(struct callback_head *unused)
840static int irq_thread(void *data) 840static int irq_thread(void *data)
841{ 841{
842 struct callback_head on_exit_work; 842 struct callback_head on_exit_work;
843 static const struct sched_param param = {
844 .sched_priority = MAX_USER_RT_PRIO/2,
845 };
846 struct irqaction *action = data; 843 struct irqaction *action = data;
847 struct irq_desc *desc = irq_to_desc(action->irq); 844 struct irq_desc *desc = irq_to_desc(action->irq);
848 irqreturn_t (*handler_fn)(struct irq_desc *desc, 845 irqreturn_t (*handler_fn)(struct irq_desc *desc,
@@ -854,8 +851,6 @@ static int irq_thread(void *data)
854 else 851 else
855 handler_fn = irq_thread_fn; 852 handler_fn = irq_thread_fn;
856 853
857 sched_setscheduler(current, SCHED_FIFO, &param);
858
859 init_task_work(&on_exit_work, irq_thread_dtor); 854 init_task_work(&on_exit_work, irq_thread_dtor);
860 task_work_add(current, &on_exit_work, false); 855 task_work_add(current, &on_exit_work, false);
861 856
@@ -950,6 +945,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
950 */ 945 */
951 if (new->thread_fn && !nested) { 946 if (new->thread_fn && !nested) {
952 struct task_struct *t; 947 struct task_struct *t;
948 static const struct sched_param param = {
949 .sched_priority = MAX_USER_RT_PRIO/2,
950 };
953 951
954 t = kthread_create(irq_thread, new, "irq/%d-%s", irq, 952 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
955 new->name); 953 new->name);
@@ -957,6 +955,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
957 ret = PTR_ERR(t); 955 ret = PTR_ERR(t);
958 goto out_mput; 956 goto out_mput;
959 } 957 }
958
959 sched_setscheduler(t, SCHED_FIFO, &param);
960
960 /* 961 /*
961 * We keep the reference to the task struct even if 962 * We keep the reference to the task struct even if
962 * the thread dies to avoid that the interrupt code 963 * the thread dies to avoid that the interrupt code
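
The manage.c change moves the SCHED_FIFO promotion out of irq_thread() and into __setup_irq(), so the creator applies the real-time policy to the freshly created kthread before it ever runs. A rough userspace analogue of that ordering, using POSIX threads (SCHED_FIFO normally needs CAP_SYS_NICE, and the priority value 50 is just a demo choice):

#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

static void *worker(void *arg)
{
	(void)arg;
	/* The worker no longer promotes itself; policy is already set. */
	return NULL;
}

int main(void)
{
	pthread_t t;
	struct sched_param param = { .sched_priority = 50 };	/* demo value */
	int err;

	if (pthread_create(&t, NULL, worker, NULL) != 0)
		return 1;

	/* Creator sets SCHED_FIFO on the new thread, mirroring __setup_irq(). */
	err = pthread_setschedparam(t, SCHED_FIFO, &param);
	if (err)
		fprintf(stderr, "pthread_setschedparam: %s (needs privileges)\n",
			strerror(err));

	pthread_join(t, NULL);
	return 0;
}
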
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 19ed5c425c3b..36f6ee181b0c 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -462,6 +462,8 @@ int show_interrupts(struct seq_file *p, void *v)
462 } else { 462 } else {
463 seq_printf(p, " %8s", "None"); 463 seq_printf(p, " %8s", "None");
464 } 464 }
465 if (desc->irq_data.domain)
466 seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
465#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL 467#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
466 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); 468 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
467#endif 469#endif
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8241906c4b61..fb326365b694 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -147,6 +147,9 @@ int __request_module(bool wait, const char *fmt, ...)
147 */ 147 */
148 WARN_ON_ONCE(wait && current_is_async()); 148 WARN_ON_ONCE(wait && current_is_async());
149 149
150 if (!modprobe_path[0])
151 return 0;
152
150 va_start(args, fmt); 153 va_start(args, fmt);
151 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 154 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
152 va_end(args); 155 va_end(args);
@@ -569,14 +572,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
569 int retval = 0; 572 int retval = 0;
570 573
571 helper_lock(); 574 helper_lock();
572 if (!sub_info->path) {
573 retval = -EINVAL;
574 goto out;
575 }
576
577 if (sub_info->path[0] == '\0')
578 goto out;
579
580 if (!khelper_wq || usermodehelper_disabled) { 575 if (!khelper_wq || usermodehelper_disabled) {
581 retval = -EBUSY; 576 retval = -EBUSY;
582 goto out; 577 goto out;
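
The kmod.c hunks fold the empty-path checks into __request_module() itself: if modprobe_path has been cleared via sysctl, the request now returns before formatting the module name or spawning a helper. A minimal sketch of that guard pattern; request_helper(), helper_path and build_and_run() are made-up names for illustration:

#include <stdio.h>

static char helper_path[256] = "";	/* imagine this is set via sysctl */

static int build_and_run(const char *name)
{
	/* stands in for the expensive argv/env setup and exec */
	printf("running %s for %s\n", helper_path, name);
	return 0;
}

static int request_helper(const char *name)
{
	/* Bail out early: an empty path means "feature disabled", not an error. */
	if (!helper_path[0])
		return 0;

	return build_and_run(name);
}

int main(void)
{
	request_helper("demo-module");	/* silently does nothing: path is empty */
	return 0;
}
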
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index bddf3b201a48..6e33498d665c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2332,6 +2332,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
2332 if (copy_from_user(buf, user_buf, buf_size)) 2332 if (copy_from_user(buf, user_buf, buf_size))
2333 return -EFAULT; 2333 return -EFAULT;
2334 2334
2335 buf[buf_size] = '\0';
2335 switch (buf[0]) { 2336 switch (buf[0]) {
2336 case 'y': 2337 case 'y':
2337 case 'Y': 2338 case 'Y':
@@ -2343,6 +2344,8 @@ static ssize_t write_enabled_file_bool(struct file *file,
2343 case '0': 2344 case '0':
2344 disarm_all_kprobes(); 2345 disarm_all_kprobes();
2345 break; 2346 break;
2347 default:
2348 return -EINVAL;
2346 } 2349 }
2347 2350
2348 return count; 2351 return count;
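
The kprobes.c fix NUL-terminates the copied buffer and rejects anything other than y/Y/1 or n/N/0 instead of silently returning success. The same parse-then-default pattern in standalone form; parse_bool_buf() is an illustrative helper, not kernel code:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Returns 1 for enable, 0 for disable, -EINVAL for anything else. */
static int parse_bool_buf(const char *user_buf, size_t count)
{
	char buf[32];
	size_t buf_size = count < sizeof(buf) - 1 ? count : sizeof(buf) - 1;

	memcpy(buf, user_buf, buf_size);
	buf[buf_size] = '\0';		/* never trust the input to be terminated */

	switch (buf[0]) {
	case 'y': case 'Y': case '1':
		return 1;
	case 'n': case 'N': case '0':
		return 0;
	default:
		return -EINVAL;		/* reject garbage instead of ignoring it */
	}
}

int main(void)
{
	printf("%d %d %d\n",
	       parse_bool_buf("y\n", 2),
	       parse_bool_buf("0", 1),
	       parse_bool_buf("maybe", 5));
	return 0;
}
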
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1f3186b37fd5..e16c45b9ee77 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -4090,7 +4090,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
4090} 4090}
4091EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); 4091EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
4092 4092
4093static void print_held_locks_bug(struct task_struct *curr) 4093static void print_held_locks_bug(void)
4094{ 4094{
4095 if (!debug_locks_off()) 4095 if (!debug_locks_off())
4096 return; 4096 return;
@@ -4099,22 +4099,21 @@ static void print_held_locks_bug(struct task_struct *curr)
4099 4099
4100 printk("\n"); 4100 printk("\n");
4101 printk("=====================================\n"); 4101 printk("=====================================\n");
4102 printk("[ BUG: lock held at task exit time! ]\n"); 4102 printk("[ BUG: %s/%d still has locks held! ]\n",
4103 current->comm, task_pid_nr(current));
4103 print_kernel_ident(); 4104 print_kernel_ident();
4104 printk("-------------------------------------\n"); 4105 printk("-------------------------------------\n");
4105 printk("%s/%d is exiting with locks still held!\n", 4106 lockdep_print_held_locks(current);
4106 curr->comm, task_pid_nr(curr));
4107 lockdep_print_held_locks(curr);
4108
4109 printk("\nstack backtrace:\n"); 4107 printk("\nstack backtrace:\n");
4110 dump_stack(); 4108 dump_stack();
4111} 4109}
4112 4110
4113void debug_check_no_locks_held(struct task_struct *task) 4111void debug_check_no_locks_held(void)
4114{ 4112{
4115 if (unlikely(task->lockdep_depth > 0)) 4113 if (unlikely(current->lockdep_depth > 0))
4116 print_held_locks_bug(task); 4114 print_held_locks_bug();
4117} 4115}
4116EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
4118 4117
4119void debug_show_all_locks(void) 4118void debug_show_all_locks(void)
4120{ 4119{
diff --git a/kernel/module.c b/kernel/module.c
index cab4bce49c23..206915830d29 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -455,7 +455,7 @@ const struct kernel_symbol *find_symbol(const char *name,
455EXPORT_SYMBOL_GPL(find_symbol); 455EXPORT_SYMBOL_GPL(find_symbol);
456 456
457/* Search for module by name: must hold module_mutex. */ 457/* Search for module by name: must hold module_mutex. */
458static struct module *find_module_all(const char *name, 458static struct module *find_module_all(const char *name, size_t len,
459 bool even_unformed) 459 bool even_unformed)
460{ 460{
461 struct module *mod; 461 struct module *mod;
@@ -463,7 +463,7 @@ static struct module *find_module_all(const char *name,
463 list_for_each_entry(mod, &modules, list) { 463 list_for_each_entry(mod, &modules, list) {
464 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) 464 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
465 continue; 465 continue;
466 if (strcmp(mod->name, name) == 0) 466 if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
467 return mod; 467 return mod;
468 } 468 }
469 return NULL; 469 return NULL;
@@ -471,7 +471,7 @@ static struct module *find_module_all(const char *name,
471 471
472struct module *find_module(const char *name) 472struct module *find_module(const char *name)
473{ 473{
474 return find_module_all(name, false); 474 return find_module_all(name, strlen(name), false);
475} 475}
476EXPORT_SYMBOL_GPL(find_module); 476EXPORT_SYMBOL_GPL(find_module);
477 477
@@ -482,23 +482,28 @@ static inline void __percpu *mod_percpu(struct module *mod)
482 return mod->percpu; 482 return mod->percpu;
483} 483}
484 484
485static int percpu_modalloc(struct module *mod, 485static int percpu_modalloc(struct module *mod, struct load_info *info)
486 unsigned long size, unsigned long align)
487{ 486{
487 Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
488 unsigned long align = pcpusec->sh_addralign;
489
490 if (!pcpusec->sh_size)
491 return 0;
492
488 if (align > PAGE_SIZE) { 493 if (align > PAGE_SIZE) {
489 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 494 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
490 mod->name, align, PAGE_SIZE); 495 mod->name, align, PAGE_SIZE);
491 align = PAGE_SIZE; 496 align = PAGE_SIZE;
492 } 497 }
493 498
494 mod->percpu = __alloc_reserved_percpu(size, align); 499 mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
495 if (!mod->percpu) { 500 if (!mod->percpu) {
496 printk(KERN_WARNING 501 printk(KERN_WARNING
497 "%s: Could not allocate %lu bytes percpu data\n", 502 "%s: Could not allocate %lu bytes percpu data\n",
498 mod->name, size); 503 mod->name, (unsigned long)pcpusec->sh_size);
499 return -ENOMEM; 504 return -ENOMEM;
500 } 505 }
501 mod->percpu_size = size; 506 mod->percpu_size = pcpusec->sh_size;
502 return 0; 507 return 0;
503} 508}
504 509
@@ -563,10 +568,12 @@ static inline void __percpu *mod_percpu(struct module *mod)
563{ 568{
564 return NULL; 569 return NULL;
565} 570}
566static inline int percpu_modalloc(struct module *mod, 571static int percpu_modalloc(struct module *mod, struct load_info *info)
567 unsigned long size, unsigned long align)
568{ 572{
569 return -ENOMEM; 573 /* UP modules shouldn't have this section: ENOMEM isn't quite right */
574 if (info->sechdrs[info->index.pcpu].sh_size != 0)
575 return -ENOMEM;
576 return 0;
570} 577}
571static inline void percpu_modfree(struct module *mod) 578static inline void percpu_modfree(struct module *mod)
572{ 579{
@@ -2927,7 +2934,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2927{ 2934{
2928 /* Module within temporary copy. */ 2935 /* Module within temporary copy. */
2929 struct module *mod; 2936 struct module *mod;
2930 Elf_Shdr *pcpusec;
2931 int err; 2937 int err;
2932 2938
2933 mod = setup_load_info(info, flags); 2939 mod = setup_load_info(info, flags);
@@ -2942,17 +2948,10 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2942 err = module_frob_arch_sections(info->hdr, info->sechdrs, 2948 err = module_frob_arch_sections(info->hdr, info->sechdrs,
2943 info->secstrings, mod); 2949 info->secstrings, mod);
2944 if (err < 0) 2950 if (err < 0)
2945 goto out; 2951 return ERR_PTR(err);
2946 2952
2947 pcpusec = &info->sechdrs[info->index.pcpu]; 2953 /* We will do a special allocation for per-cpu sections later. */
2948 if (pcpusec->sh_size) { 2954 info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
2949 /* We have a special allocation for this section. */
2950 err = percpu_modalloc(mod,
2951 pcpusec->sh_size, pcpusec->sh_addralign);
2952 if (err)
2953 goto out;
2954 pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
2955 }
2956 2955
2957 /* Determine total sizes, and put offsets in sh_entsize. For now 2956 /* Determine total sizes, and put offsets in sh_entsize. For now
2958 this is done generically; there doesn't appear to be any 2957 this is done generically; there doesn't appear to be any
@@ -2963,17 +2962,12 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2963 /* Allocate and move to the final place */ 2962 /* Allocate and move to the final place */
2964 err = move_module(mod, info); 2963 err = move_module(mod, info);
2965 if (err) 2964 if (err)
2966 goto free_percpu; 2965 return ERR_PTR(err);
2967 2966
2968 /* Module has been copied to its final place now: return it. */ 2967 /* Module has been copied to its final place now: return it. */
2969 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2968 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2970 kmemleak_load_module(mod, info); 2969 kmemleak_load_module(mod, info);
2971 return mod; 2970 return mod;
2972
2973free_percpu:
2974 percpu_modfree(mod);
2975out:
2976 return ERR_PTR(err);
2977} 2971}
2978 2972
2979/* mod is no longer valid after this! */ 2973/* mod is no longer valid after this! */
@@ -3014,7 +3008,7 @@ static bool finished_loading(const char *name)
3014 bool ret; 3008 bool ret;
3015 3009
3016 mutex_lock(&module_mutex); 3010 mutex_lock(&module_mutex);
3017 mod = find_module_all(name, true); 3011 mod = find_module_all(name, strlen(name), true);
3018 ret = !mod || mod->state == MODULE_STATE_LIVE 3012 ret = !mod || mod->state == MODULE_STATE_LIVE
3019 || mod->state == MODULE_STATE_GOING; 3013 || mod->state == MODULE_STATE_GOING;
3020 mutex_unlock(&module_mutex); 3014 mutex_unlock(&module_mutex);
@@ -3152,7 +3146,8 @@ static int add_unformed_module(struct module *mod)
3152 3146
3153again: 3147again:
3154 mutex_lock(&module_mutex); 3148 mutex_lock(&module_mutex);
3155 if ((old = find_module_all(mod->name, true)) != NULL) { 3149 old = find_module_all(mod->name, strlen(mod->name), true);
3150 if (old != NULL) {
3156 if (old->state == MODULE_STATE_COMING 3151 if (old->state == MODULE_STATE_COMING
3157 || old->state == MODULE_STATE_UNFORMED) { 3152 || old->state == MODULE_STATE_UNFORMED) {
3158 /* Wait in case it fails to load. */ 3153 /* Wait in case it fails to load. */
@@ -3198,6 +3193,17 @@ out:
3198 return err; 3193 return err;
3199} 3194}
3200 3195
3196static int unknown_module_param_cb(char *param, char *val, const char *modname)
3197{
3198 /* Check for magic 'dyndbg' arg */
3199 int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
3200 if (ret != 0) {
3201 printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n",
3202 modname, param);
3203 }
3204 return 0;
3205}
3206
3201/* Allocate and load the module: note that size of section 0 is always 3207/* Allocate and load the module: note that size of section 0 is always
3202 zero, and we rely on this for optional sections. */ 3208 zero, and we rely on this for optional sections. */
3203static int load_module(struct load_info *info, const char __user *uargs, 3209static int load_module(struct load_info *info, const char __user *uargs,
@@ -3237,6 +3243,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
3237 } 3243 }
3238#endif 3244#endif
3239 3245
3246 /* To avoid stressing percpu allocator, do this once we're unique. */
3247 err = percpu_modalloc(mod, info);
3248 if (err)
3249 goto unlink_mod;
3250
3240 /* Now module is in final location, initialize linked lists, etc. */ 3251 /* Now module is in final location, initialize linked lists, etc. */
3241 err = module_unload_init(mod); 3252 err = module_unload_init(mod);
3242 if (err) 3253 if (err)
@@ -3284,7 +3295,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
3284 3295
3285 /* Module is ready to execute: parsing args may do that. */ 3296 /* Module is ready to execute: parsing args may do that. */
3286 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 3297 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
3287 -32768, 32767, &ddebug_dyndbg_module_param_cb); 3298 -32768, 32767, unknown_module_param_cb);
3288 if (err < 0) 3299 if (err < 0)
3289 goto bug_cleanup; 3300 goto bug_cleanup;
3290 3301
@@ -3563,10 +3574,8 @@ unsigned long module_kallsyms_lookup_name(const char *name)
3563 /* Don't lock: we're in enough trouble already. */ 3574 /* Don't lock: we're in enough trouble already. */
3564 preempt_disable(); 3575 preempt_disable();
3565 if ((colon = strchr(name, ':')) != NULL) { 3576 if ((colon = strchr(name, ':')) != NULL) {
3566 *colon = '\0'; 3577 if ((mod = find_module_all(name, colon - name, false)) != NULL)
3567 if ((mod = find_module(name)) != NULL)
3568 ret = mod_find_symname(mod, colon+1); 3578 ret = mod_find_symname(mod, colon+1);
3569 *colon = ':';
3570 } else { 3579 } else {
3571 list_for_each_entry_rcu(mod, &modules, list) { 3580 list_for_each_entry_rcu(mod, &modules, list) {
3572 if (mod->state == MODULE_STATE_UNFORMED) 3581 if (mod->state == MODULE_STATE_UNFORMED)
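
Passing a length into find_module_all() lets module_kallsyms_lookup_name() match the "module:symbol" prefix without temporarily writing a NUL into the (possibly const) string. A small standalone sketch of the same length-based matching; lookup_by_prefix() and the table are illustrative only:

#include <stdio.h>
#include <string.h>

static const char *modules[] = { "ext4", "kvm", "kvm_intel" };

/* Match name[0..len) against the table without modifying 'name'. */
static const char *lookup_by_prefix(const char *name, size_t len)
{
	size_t i;

	for (i = 0; i < sizeof(modules) / sizeof(modules[0]); i++)
		if (strlen(modules[i]) == len && !memcmp(modules[i], name, len))
			return modules[i];
	return NULL;
}

int main(void)
{
	const char *query = "kvm:kvm_apic_write";	/* const: must not be patched in place */
	const char *colon = strchr(query, ':');
	const char *mod = lookup_by_prefix(query, colon - query);

	printf("module: %s, symbol: %s\n", mod ? mod : "none", colon + 1);
	return 0;
}
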
diff --git a/kernel/mutex.c b/kernel/mutex.c
index ad53a664f113..a52ee7bb830d 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -18,6 +18,7 @@
18 * Also see Documentation/mutex-design.txt. 18 * Also see Documentation/mutex-design.txt.
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/ww_mutex.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
22#include <linux/sched/rt.h> 23#include <linux/sched/rt.h>
23#include <linux/export.h> 24#include <linux/export.h>
@@ -254,16 +255,165 @@ void __sched mutex_unlock(struct mutex *lock)
254 255
255EXPORT_SYMBOL(mutex_unlock); 256EXPORT_SYMBOL(mutex_unlock);
256 257
258/**
259 * ww_mutex_unlock - release the w/w mutex
260 * @lock: the mutex to be released
261 *
262 * Unlock a mutex that has been locked by this task previously with any of the
263 * ww_mutex_lock* functions (with or without an acquire context). It is
264 * forbidden to release the locks after releasing the acquire context.
265 *
266 * This function must not be used in interrupt context. Unlocking
 267 * of an unlocked mutex is not allowed.
268 */
269void __sched ww_mutex_unlock(struct ww_mutex *lock)
270{
271 /*
272 * The unlocking fastpath is the 0->1 transition from 'locked'
273 * into 'unlocked' state:
274 */
275 if (lock->ctx) {
276#ifdef CONFIG_DEBUG_MUTEXES
277 DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
278#endif
279 if (lock->ctx->acquired > 0)
280 lock->ctx->acquired--;
281 lock->ctx = NULL;
282 }
283
284#ifndef CONFIG_DEBUG_MUTEXES
285 /*
286 * When debugging is enabled we must not clear the owner before time,
287 * the slow path will always be taken, and that clears the owner field
288 * after verifying that it was indeed current.
289 */
290 mutex_clear_owner(&lock->base);
291#endif
292 __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath);
293}
294EXPORT_SYMBOL(ww_mutex_unlock);
295
296static inline int __sched
297__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
298{
299 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
300 struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
301
302 if (!hold_ctx)
303 return 0;
304
305 if (unlikely(ctx == hold_ctx))
306 return -EALREADY;
307
308 if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
309 (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
310#ifdef CONFIG_DEBUG_MUTEXES
311 DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
312 ctx->contending_lock = ww;
313#endif
314 return -EDEADLK;
315 }
316
317 return 0;
318}
319
320static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
321 struct ww_acquire_ctx *ww_ctx)
322{
323#ifdef CONFIG_DEBUG_MUTEXES
324 /*
325 * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
326 * but released with a normal mutex_unlock in this call.
327 *
328 * This should never happen, always use ww_mutex_unlock.
329 */
330 DEBUG_LOCKS_WARN_ON(ww->ctx);
331
332 /*
333 * Not quite done after calling ww_acquire_done() ?
334 */
335 DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
336
337 if (ww_ctx->contending_lock) {
338 /*
339 * After -EDEADLK you tried to
340 * acquire a different ww_mutex? Bad!
341 */
342 DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
343
344 /*
345 * You called ww_mutex_lock after receiving -EDEADLK,
346 * but 'forgot' to unlock everything else first?
347 */
348 DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
349 ww_ctx->contending_lock = NULL;
350 }
351
352 /*
353 * Naughty, using a different class will lead to undefined behavior!
354 */
355 DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
356#endif
357 ww_ctx->acquired++;
358}
359
360/*
361 * after acquiring lock with fastpath or when we lost out in contested
362 * slowpath, set ctx and wake up any waiters so they can recheck.
363 *
364 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
365 * as the fastpath and opportunistic spinning are disabled in that case.
366 */
367static __always_inline void
368ww_mutex_set_context_fastpath(struct ww_mutex *lock,
369 struct ww_acquire_ctx *ctx)
370{
371 unsigned long flags;
372 struct mutex_waiter *cur;
373
374 ww_mutex_lock_acquired(lock, ctx);
375
376 lock->ctx = ctx;
377
378 /*
379 * The lock->ctx update should be visible on all cores before
380 * the atomic read is done, otherwise contended waiters might be
381 * missed. The contended waiters will either see ww_ctx == NULL
382 * and keep spinning, or it will acquire wait_lock, add itself
383 * to waiter list and sleep.
384 */
385 smp_mb(); /* ^^^ */
386
387 /*
388 * Check if lock is contended, if not there is nobody to wake up
389 */
390 if (likely(atomic_read(&lock->base.count) == 0))
391 return;
392
393 /*
394 * Uh oh, we raced in fastpath, wake up everyone in this case,
395 * so they can see the new lock->ctx.
396 */
397 spin_lock_mutex(&lock->base.wait_lock, flags);
398 list_for_each_entry(cur, &lock->base.wait_list, list) {
399 debug_mutex_wake_waiter(&lock->base, cur);
400 wake_up_process(cur->task);
401 }
402 spin_unlock_mutex(&lock->base.wait_lock, flags);
403}
404
257/* 405/*
258 * Lock a mutex (possibly interruptible), slowpath: 406 * Lock a mutex (possibly interruptible), slowpath:
259 */ 407 */
260static inline int __sched 408static __always_inline int __sched
261__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 409__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
262 struct lockdep_map *nest_lock, unsigned long ip) 410 struct lockdep_map *nest_lock, unsigned long ip,
411 struct ww_acquire_ctx *ww_ctx)
263{ 412{
264 struct task_struct *task = current; 413 struct task_struct *task = current;
265 struct mutex_waiter waiter; 414 struct mutex_waiter waiter;
266 unsigned long flags; 415 unsigned long flags;
416 int ret;
267 417
268 preempt_disable(); 418 preempt_disable();
269 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); 419 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
@@ -298,6 +448,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
298 struct task_struct *owner; 448 struct task_struct *owner;
299 struct mspin_node node; 449 struct mspin_node node;
300 450
451 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) {
452 struct ww_mutex *ww;
453
454 ww = container_of(lock, struct ww_mutex, base);
455 /*
456 * If ww->ctx is set the contents are undefined, only
457 * by acquiring wait_lock there is a guarantee that
458 * they are not invalid when reading.
459 *
460 * As such, when deadlock detection needs to be
461 * performed the optimistic spinning cannot be done.
462 */
463 if (ACCESS_ONCE(ww->ctx))
464 break;
465 }
466
301 /* 467 /*
302 * If there's an owner, wait for it to either 468 * If there's an owner, wait for it to either
303 * release the lock or go to sleep. 469 * release the lock or go to sleep.
@@ -312,6 +478,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
312 if ((atomic_read(&lock->count) == 1) && 478 if ((atomic_read(&lock->count) == 1) &&
313 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 479 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
314 lock_acquired(&lock->dep_map, ip); 480 lock_acquired(&lock->dep_map, ip);
481 if (!__builtin_constant_p(ww_ctx == NULL)) {
482 struct ww_mutex *ww;
483 ww = container_of(lock, struct ww_mutex, base);
484
485 ww_mutex_set_context_fastpath(ww, ww_ctx);
486 }
487
315 mutex_set_owner(lock); 488 mutex_set_owner(lock);
316 mspin_unlock(MLOCK(lock), &node); 489 mspin_unlock(MLOCK(lock), &node);
317 preempt_enable(); 490 preempt_enable();
@@ -371,15 +544,16 @@ slowpath:
371 * TASK_UNINTERRUPTIBLE case.) 544 * TASK_UNINTERRUPTIBLE case.)
372 */ 545 */
373 if (unlikely(signal_pending_state(state, task))) { 546 if (unlikely(signal_pending_state(state, task))) {
374 mutex_remove_waiter(lock, &waiter, 547 ret = -EINTR;
375 task_thread_info(task)); 548 goto err;
376 mutex_release(&lock->dep_map, 1, ip); 549 }
377 spin_unlock_mutex(&lock->wait_lock, flags);
378 550
379 debug_mutex_free_waiter(&waiter); 551 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) {
380 preempt_enable(); 552 ret = __mutex_lock_check_stamp(lock, ww_ctx);
381 return -EINTR; 553 if (ret)
554 goto err;
382 } 555 }
556
383 __set_task_state(task, state); 557 __set_task_state(task, state);
384 558
385 /* didn't get the lock, go to sleep: */ 559 /* didn't get the lock, go to sleep: */
@@ -394,6 +568,30 @@ done:
394 mutex_remove_waiter(lock, &waiter, current_thread_info()); 568 mutex_remove_waiter(lock, &waiter, current_thread_info());
395 mutex_set_owner(lock); 569 mutex_set_owner(lock);
396 570
571 if (!__builtin_constant_p(ww_ctx == NULL)) {
572 struct ww_mutex *ww = container_of(lock,
573 struct ww_mutex,
574 base);
575 struct mutex_waiter *cur;
576
577 /*
578 * This branch gets optimized out for the common case,
579 * and is only important for ww_mutex_lock.
580 */
581
582 ww_mutex_lock_acquired(ww, ww_ctx);
583 ww->ctx = ww_ctx;
584
585 /*
586 * Give any possible sleeping processes the chance to wake up,
587 * so they can recheck if they have to back off.
588 */
589 list_for_each_entry(cur, &lock->wait_list, list) {
590 debug_mutex_wake_waiter(lock, cur);
591 wake_up_process(cur->task);
592 }
593 }
594
397 /* set it to 0 if there are no waiters left: */ 595 /* set it to 0 if there are no waiters left: */
398 if (likely(list_empty(&lock->wait_list))) 596 if (likely(list_empty(&lock->wait_list)))
399 atomic_set(&lock->count, 0); 597 atomic_set(&lock->count, 0);
@@ -404,6 +602,14 @@ done:
404 preempt_enable(); 602 preempt_enable();
405 603
406 return 0; 604 return 0;
605
606err:
607 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
608 spin_unlock_mutex(&lock->wait_lock, flags);
609 debug_mutex_free_waiter(&waiter);
610 mutex_release(&lock->dep_map, 1, ip);
611 preempt_enable();
612 return ret;
407} 613}
408 614
409#ifdef CONFIG_DEBUG_LOCK_ALLOC 615#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -411,7 +617,8 @@ void __sched
411mutex_lock_nested(struct mutex *lock, unsigned int subclass) 617mutex_lock_nested(struct mutex *lock, unsigned int subclass)
412{ 618{
413 might_sleep(); 619 might_sleep();
414 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); 620 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
621 subclass, NULL, _RET_IP_, NULL);
415} 622}
416 623
417EXPORT_SYMBOL_GPL(mutex_lock_nested); 624EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -420,7 +627,8 @@ void __sched
420_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) 627_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
421{ 628{
422 might_sleep(); 629 might_sleep();
423 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); 630 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
631 0, nest, _RET_IP_, NULL);
424} 632}
425 633
426EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); 634EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
@@ -429,7 +637,8 @@ int __sched
429mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) 637mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
430{ 638{
431 might_sleep(); 639 might_sleep();
432 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); 640 return __mutex_lock_common(lock, TASK_KILLABLE,
641 subclass, NULL, _RET_IP_, NULL);
433} 642}
434EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); 643EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
435 644
@@ -438,10 +647,68 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
438{ 647{
439 might_sleep(); 648 might_sleep();
440 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 649 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
441 subclass, NULL, _RET_IP_); 650 subclass, NULL, _RET_IP_, NULL);
442} 651}
443 652
444EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 653EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
654
655static inline int
656ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
657{
658#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
659 unsigned tmp;
660
661 if (ctx->deadlock_inject_countdown-- == 0) {
662 tmp = ctx->deadlock_inject_interval;
663 if (tmp > UINT_MAX/4)
664 tmp = UINT_MAX;
665 else
666 tmp = tmp*2 + tmp + tmp/2;
667
668 ctx->deadlock_inject_interval = tmp;
669 ctx->deadlock_inject_countdown = tmp;
670 ctx->contending_lock = lock;
671
672 ww_mutex_unlock(lock);
673
674 return -EDEADLK;
675 }
676#endif
677
678 return 0;
679}
680
681int __sched
682__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
683{
684 int ret;
685
686 might_sleep();
687 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
688 0, &ctx->dep_map, _RET_IP_, ctx);
689 if (!ret && ctx->acquired > 1)
690 return ww_mutex_deadlock_injection(lock, ctx);
691
692 return ret;
693}
694EXPORT_SYMBOL_GPL(__ww_mutex_lock);
695
696int __sched
697__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
698{
699 int ret;
700
701 might_sleep();
702 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
703 0, &ctx->dep_map, _RET_IP_, ctx);
704
705 if (!ret && ctx->acquired > 1)
706 return ww_mutex_deadlock_injection(lock, ctx);
707
708 return ret;
709}
710EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
711
445#endif 712#endif
446 713
447/* 714/*
@@ -494,10 +761,10 @@ __mutex_unlock_slowpath(atomic_t *lock_count)
494 * mutex_lock_interruptible() and mutex_trylock(). 761 * mutex_lock_interruptible() and mutex_trylock().
495 */ 762 */
496static noinline int __sched 763static noinline int __sched
497__mutex_lock_killable_slowpath(atomic_t *lock_count); 764__mutex_lock_killable_slowpath(struct mutex *lock);
498 765
499static noinline int __sched 766static noinline int __sched
500__mutex_lock_interruptible_slowpath(atomic_t *lock_count); 767__mutex_lock_interruptible_slowpath(struct mutex *lock);
501 768
502/** 769/**
503 * mutex_lock_interruptible - acquire the mutex, interruptible 770 * mutex_lock_interruptible - acquire the mutex, interruptible
@@ -515,12 +782,12 @@ int __sched mutex_lock_interruptible(struct mutex *lock)
515 int ret; 782 int ret;
516 783
517 might_sleep(); 784 might_sleep();
518 ret = __mutex_fastpath_lock_retval 785 ret = __mutex_fastpath_lock_retval(&lock->count);
519 (&lock->count, __mutex_lock_interruptible_slowpath); 786 if (likely(!ret)) {
520 if (!ret)
521 mutex_set_owner(lock); 787 mutex_set_owner(lock);
522 788 return 0;
523 return ret; 789 } else
790 return __mutex_lock_interruptible_slowpath(lock);
524} 791}
525 792
526EXPORT_SYMBOL(mutex_lock_interruptible); 793EXPORT_SYMBOL(mutex_lock_interruptible);
@@ -530,12 +797,12 @@ int __sched mutex_lock_killable(struct mutex *lock)
530 int ret; 797 int ret;
531 798
532 might_sleep(); 799 might_sleep();
533 ret = __mutex_fastpath_lock_retval 800 ret = __mutex_fastpath_lock_retval(&lock->count);
534 (&lock->count, __mutex_lock_killable_slowpath); 801 if (likely(!ret)) {
535 if (!ret)
536 mutex_set_owner(lock); 802 mutex_set_owner(lock);
537 803 return 0;
538 return ret; 804 } else
805 return __mutex_lock_killable_slowpath(lock);
539} 806}
540EXPORT_SYMBOL(mutex_lock_killable); 807EXPORT_SYMBOL(mutex_lock_killable);
541 808
@@ -544,24 +811,39 @@ __mutex_lock_slowpath(atomic_t *lock_count)
544{ 811{
545 struct mutex *lock = container_of(lock_count, struct mutex, count); 812 struct mutex *lock = container_of(lock_count, struct mutex, count);
546 813
547 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); 814 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
815 NULL, _RET_IP_, NULL);
548} 816}
549 817
550static noinline int __sched 818static noinline int __sched
551__mutex_lock_killable_slowpath(atomic_t *lock_count) 819__mutex_lock_killable_slowpath(struct mutex *lock)
552{ 820{
553 struct mutex *lock = container_of(lock_count, struct mutex, count); 821 return __mutex_lock_common(lock, TASK_KILLABLE, 0,
822 NULL, _RET_IP_, NULL);
823}
554 824
555 return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); 825static noinline int __sched
826__mutex_lock_interruptible_slowpath(struct mutex *lock)
827{
828 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
829 NULL, _RET_IP_, NULL);
556} 830}
557 831
558static noinline int __sched 832static noinline int __sched
559__mutex_lock_interruptible_slowpath(atomic_t *lock_count) 833__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
560{ 834{
561 struct mutex *lock = container_of(lock_count, struct mutex, count); 835 return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
836 NULL, _RET_IP_, ctx);
837}
562 838
563 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); 839static noinline int __sched
840__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
841 struct ww_acquire_ctx *ctx)
842{
843 return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
844 NULL, _RET_IP_, ctx);
564} 845}
846
565#endif 847#endif
566 848
567/* 849/*
@@ -617,6 +899,45 @@ int __sched mutex_trylock(struct mutex *lock)
617} 899}
618EXPORT_SYMBOL(mutex_trylock); 900EXPORT_SYMBOL(mutex_trylock);
619 901
902#ifndef CONFIG_DEBUG_LOCK_ALLOC
903int __sched
904__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
905{
906 int ret;
907
908 might_sleep();
909
910 ret = __mutex_fastpath_lock_retval(&lock->base.count);
911
912 if (likely(!ret)) {
913 ww_mutex_set_context_fastpath(lock, ctx);
914 mutex_set_owner(&lock->base);
915 } else
916 ret = __ww_mutex_lock_slowpath(lock, ctx);
917 return ret;
918}
919EXPORT_SYMBOL(__ww_mutex_lock);
920
921int __sched
922__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
923{
924 int ret;
925
926 might_sleep();
927
928 ret = __mutex_fastpath_lock_retval(&lock->base.count);
929
930 if (likely(!ret)) {
931 ww_mutex_set_context_fastpath(lock, ctx);
932 mutex_set_owner(&lock->base);
933 } else
934 ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx);
935 return ret;
936}
937EXPORT_SYMBOL(__ww_mutex_lock_interruptible);
938
939#endif
940
620/** 941/**
621 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 942 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
622 * @cnt: the atomic which we are to dec 943 * @cnt: the atomic which we are to dec
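
The heart of the new wait/wound support is __mutex_lock_check_stamp(): when a waiter with an acquire context hits a lock already held under another context, the younger context (larger stamp) backs off with -EDEADLK while the older one is allowed to wait. A userspace model of just that decision; struct demo_ctx and check_stamp() are simplified stand-ins for ww_acquire_ctx and the kernel helper:

#include <errno.h>
#include <stdio.h>

struct demo_ctx {
	unsigned long stamp;	/* monotonically increasing acquisition ticket */
};

/* 0: requester may wait, -EALREADY: same context, -EDEADLK: requester backs off. */
static int check_stamp(const struct demo_ctx *req, const struct demo_ctx *holder)
{
	if (!holder)
		return 0;
	if (req == holder)
		return -EALREADY;
	/*
	 * Younger (larger stamp) backs off so the older context can make
	 * progress; the kernel additionally breaks stamp ties by pointer.
	 */
	if ((long)(req->stamp - holder->stamp) > 0)
		return -EDEADLK;
	return 0;
}

int main(void)
{
	struct demo_ctx old_ctx = { .stamp = 10 };
	struct demo_ctx young_ctx = { .stamp = 42 };

	printf("young vs old : %d\n", check_stamp(&young_ctx, &old_ctx));	/* -EDEADLK */
	printf("old vs young : %d\n", check_stamp(&old_ctx, &young_ctx));	/* 0: wait */
	printf("uncontended  : %d\n", check_stamp(&old_ctx, NULL));		/* 0 */
	return 0;
}

On -EDEADLK the caller is expected to release every lock held under its context and retry from the start, which is exactly the path the CONFIG_DEBUG_WW_MUTEX_SLOWPATH injection helper above forces at intervals.
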
diff --git a/kernel/panic.c b/kernel/panic.c
index 167ec097ce8b..801864600514 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -15,6 +15,7 @@
15#include <linux/notifier.h> 15#include <linux/notifier.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/random.h> 17#include <linux/random.h>
18#include <linux/ftrace.h>
18#include <linux/reboot.h> 19#include <linux/reboot.h>
19#include <linux/delay.h> 20#include <linux/delay.h>
20#include <linux/kexec.h> 21#include <linux/kexec.h>
@@ -399,8 +400,11 @@ struct slowpath_args {
399static void warn_slowpath_common(const char *file, int line, void *caller, 400static void warn_slowpath_common(const char *file, int line, void *caller,
400 unsigned taint, struct slowpath_args *args) 401 unsigned taint, struct slowpath_args *args)
401{ 402{
402 printk(KERN_WARNING "------------[ cut here ]------------\n"); 403 disable_trace_on_warning();
403 printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); 404
405 pr_warn("------------[ cut here ]------------\n");
406 pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n",
407 raw_smp_processor_id(), current->pid, file, line, caller);
404 408
405 if (args) 409 if (args)
406 vprintk(args->fmt, args->args); 410 vprintk(args->fmt, args->args);
diff --git a/kernel/params.c b/kernel/params.c
index 53b958fcd639..440e65d1a544 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -787,7 +787,7 @@ static void __init kernel_add_sysfs_param(const char *name,
787} 787}
788 788
789/* 789/*
790 * param_sysfs_builtin - add contents in /sys/parameters for built-in modules 790 * param_sysfs_builtin - add sysfs parameters for built-in modules
791 * 791 *
792 * Add module_parameters to sysfs for "modules" built into the kernel. 792 * Add module_parameters to sysfs for "modules" built into the kernel.
793 * 793 *
diff --git a/kernel/pid.c b/kernel/pid.c
index 0db3e791a06d..66505c1dfc51 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -75,6 +75,7 @@ struct pid_namespace init_pid_ns = {
75 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } 75 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
76 }, 76 },
77 .last_pid = 0, 77 .last_pid = 0,
78 .nr_hashed = PIDNS_HASH_ADDING,
78 .level = 0, 79 .level = 0,
79 .child_reaper = &init_task, 80 .child_reaper = &init_task,
80 .user_ns = &init_user_ns, 81 .user_ns = &init_user_ns,
@@ -373,14 +374,10 @@ EXPORT_SYMBOL_GPL(find_vpid);
373/* 374/*
374 * attach_pid() must be called with the tasklist_lock write-held. 375 * attach_pid() must be called with the tasklist_lock write-held.
375 */ 376 */
376void attach_pid(struct task_struct *task, enum pid_type type, 377void attach_pid(struct task_struct *task, enum pid_type type)
377 struct pid *pid)
378{ 378{
379 struct pid_link *link; 379 struct pid_link *link = &task->pids[type];
380 380 hlist_add_head_rcu(&link->node, &link->pid->tasks[type]);
381 link = &task->pids[type];
382 link->pid = pid;
383 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
384} 381}
385 382
386static void __change_pid(struct task_struct *task, enum pid_type type, 383static void __change_pid(struct task_struct *task, enum pid_type type,
@@ -412,7 +409,7 @@ void change_pid(struct task_struct *task, enum pid_type type,
412 struct pid *pid) 409 struct pid *pid)
413{ 410{
414 __change_pid(task, type, pid); 411 __change_pid(task, type, pid);
415 attach_pid(task, type, pid); 412 attach_pid(task, type);
416} 413}
417 414
418/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ 415/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
@@ -594,7 +591,6 @@ void __init pidmap_init(void)
594 /* Reserve PID 0. We never call free_pidmap(0) */ 591 /* Reserve PID 0. We never call free_pidmap(0) */
595 set_bit(0, init_pid_ns.pidmap[0].page); 592 set_bit(0, init_pid_ns.pidmap[0].page);
596 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 593 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
597 init_pid_ns.nr_hashed = PIDNS_HASH_ADDING;
598 594
599 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 595 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
600 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 596 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 42670e9b44e0..c7f31aa272f7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock)
51 return error; 51 return error;
52} 52}
53 53
54static inline union cpu_time_count 54static inline unsigned long long
55timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) 55timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
56{ 56{
57 union cpu_time_count ret; 57 unsigned long long ret;
58 ret.sched = 0; /* high half always zero when .cpu used */ 58
59 ret = 0; /* high half always zero when .cpu used */
59 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 60 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
60 ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; 61 ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
61 } else { 62 } else {
62 ret.cpu = timespec_to_cputime(tp); 63 ret = cputime_to_expires(timespec_to_cputime(tp));
63 } 64 }
64 return ret; 65 return ret;
65} 66}
66 67
67static void sample_to_timespec(const clockid_t which_clock, 68static void sample_to_timespec(const clockid_t which_clock,
68 union cpu_time_count cpu, 69 unsigned long long expires,
69 struct timespec *tp) 70 struct timespec *tp)
70{ 71{
71 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) 72 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
72 *tp = ns_to_timespec(cpu.sched); 73 *tp = ns_to_timespec(expires);
73 else 74 else
74 cputime_to_timespec(cpu.cpu, tp); 75 cputime_to_timespec((__force cputime_t)expires, tp);
75}
76
77static inline int cpu_time_before(const clockid_t which_clock,
78 union cpu_time_count now,
79 union cpu_time_count then)
80{
81 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
82 return now.sched < then.sched;
83 } else {
84 return now.cpu < then.cpu;
85 }
86}
87static inline void cpu_time_add(const clockid_t which_clock,
88 union cpu_time_count *acc,
89 union cpu_time_count val)
90{
91 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
92 acc->sched += val.sched;
93 } else {
94 acc->cpu += val.cpu;
95 }
96}
97static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
98 union cpu_time_count a,
99 union cpu_time_count b)
100{
101 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
102 a.sched -= b.sched;
103 } else {
104 a.cpu -= b.cpu;
105 }
106 return a;
107} 76}
108 77
109/* 78/*
@@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
111 * given the current clock sample. 80 * given the current clock sample.
112 */ 81 */
113static void bump_cpu_timer(struct k_itimer *timer, 82static void bump_cpu_timer(struct k_itimer *timer,
114 union cpu_time_count now) 83 unsigned long long now)
115{ 84{
116 int i; 85 int i;
86 unsigned long long delta, incr;
117 87
118 if (timer->it.cpu.incr.sched == 0) 88 if (timer->it.cpu.incr == 0)
119 return; 89 return;
120 90
121 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 91 if (now < timer->it.cpu.expires)
122 unsigned long long delta, incr; 92 return;
123 93
124 if (now.sched < timer->it.cpu.expires.sched) 94 incr = timer->it.cpu.incr;
125 return; 95 delta = now + incr - timer->it.cpu.expires;
126 incr = timer->it.cpu.incr.sched;
127 delta = now.sched + incr - timer->it.cpu.expires.sched;
128 /* Don't use (incr*2 < delta), incr*2 might overflow. */
129 for (i = 0; incr < delta - incr; i++)
130 incr = incr << 1;
131 for (; i >= 0; incr >>= 1, i--) {
132 if (delta < incr)
133 continue;
134 timer->it.cpu.expires.sched += incr;
135 timer->it_overrun += 1 << i;
136 delta -= incr;
137 }
138 } else {
139 cputime_t delta, incr;
140 96
141 if (now.cpu < timer->it.cpu.expires.cpu) 97 /* Don't use (incr*2 < delta), incr*2 might overflow. */
142 return; 98 for (i = 0; incr < delta - incr; i++)
143 incr = timer->it.cpu.incr.cpu; 99 incr = incr << 1;
144 delta = now.cpu + incr - timer->it.cpu.expires.cpu; 100
145 /* Don't use (incr*2 < delta), incr*2 might overflow. */ 101 for (; i >= 0; incr >>= 1, i--) {
146 for (i = 0; incr < delta - incr; i++) 102 if (delta < incr)
147 incr += incr; 103 continue;
148 for (; i >= 0; incr = incr >> 1, i--) { 104
149 if (delta < incr) 105 timer->it.cpu.expires += incr;
150 continue; 106 timer->it_overrun += 1 << i;
151 timer->it.cpu.expires.cpu += incr; 107 delta -= incr;
152 timer->it_overrun += 1 << i;
153 delta -= incr;
154 }
155 } 108 }
156} 109}
157 110
@@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime)
170 return 0; 123 return 0;
171} 124}
172 125
173static inline cputime_t prof_ticks(struct task_struct *p) 126static inline unsigned long long prof_ticks(struct task_struct *p)
174{ 127{
175 cputime_t utime, stime; 128 cputime_t utime, stime;
176 129
177 task_cputime(p, &utime, &stime); 130 task_cputime(p, &utime, &stime);
178 131
179 return utime + stime; 132 return cputime_to_expires(utime + stime);
180} 133}
181static inline cputime_t virt_ticks(struct task_struct *p) 134static inline unsigned long long virt_ticks(struct task_struct *p)
182{ 135{
183 cputime_t utime; 136 cputime_t utime;
184 137
185 task_cputime(p, &utime, NULL); 138 task_cputime(p, &utime, NULL);
186 139
187 return utime; 140 return cputime_to_expires(utime);
188} 141}
189 142
190static int 143static int
@@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
225 * Sample a per-thread clock for the given task. 178 * Sample a per-thread clock for the given task.
226 */ 179 */
227static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, 180static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
228 union cpu_time_count *cpu) 181 unsigned long long *sample)
229{ 182{
230 switch (CPUCLOCK_WHICH(which_clock)) { 183 switch (CPUCLOCK_WHICH(which_clock)) {
231 default: 184 default:
232 return -EINVAL; 185 return -EINVAL;
233 case CPUCLOCK_PROF: 186 case CPUCLOCK_PROF:
234 cpu->cpu = prof_ticks(p); 187 *sample = prof_ticks(p);
235 break; 188 break;
236 case CPUCLOCK_VIRT: 189 case CPUCLOCK_VIRT:
237 cpu->cpu = virt_ticks(p); 190 *sample = virt_ticks(p);
238 break; 191 break;
239 case CPUCLOCK_SCHED: 192 case CPUCLOCK_SCHED:
240 cpu->sched = task_sched_runtime(p); 193 *sample = task_sched_runtime(p);
241 break; 194 break;
242 } 195 }
243 return 0; 196 return 0;
@@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
284 */ 237 */
285static int cpu_clock_sample_group(const clockid_t which_clock, 238static int cpu_clock_sample_group(const clockid_t which_clock,
286 struct task_struct *p, 239 struct task_struct *p,
287 union cpu_time_count *cpu) 240 unsigned long long *sample)
288{ 241{
289 struct task_cputime cputime; 242 struct task_cputime cputime;
290 243
@@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
293 return -EINVAL; 246 return -EINVAL;
294 case CPUCLOCK_PROF: 247 case CPUCLOCK_PROF:
295 thread_group_cputime(p, &cputime); 248 thread_group_cputime(p, &cputime);
296 cpu->cpu = cputime.utime + cputime.stime; 249 *sample = cputime_to_expires(cputime.utime + cputime.stime);
297 break; 250 break;
298 case CPUCLOCK_VIRT: 251 case CPUCLOCK_VIRT:
299 thread_group_cputime(p, &cputime); 252 thread_group_cputime(p, &cputime);
300 cpu->cpu = cputime.utime; 253 *sample = cputime_to_expires(cputime.utime);
301 break; 254 break;
302 case CPUCLOCK_SCHED: 255 case CPUCLOCK_SCHED:
303 thread_group_cputime(p, &cputime); 256 thread_group_cputime(p, &cputime);
304 cpu->sched = cputime.sum_exec_runtime; 257 *sample = cputime.sum_exec_runtime;
305 break; 258 break;
306 } 259 }
307 return 0; 260 return 0;
@@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
312{ 265{
313 const pid_t pid = CPUCLOCK_PID(which_clock); 266 const pid_t pid = CPUCLOCK_PID(which_clock);
314 int error = -EINVAL; 267 int error = -EINVAL;
315 union cpu_time_count rtn; 268 unsigned long long rtn;
316 269
317 if (pid == 0) { 270 if (pid == 0) {
318 /* 271 /*
@@ -446,6 +399,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
446 return ret; 399 return ret;
447} 400}
448 401
402static void cleanup_timers_list(struct list_head *head,
403 unsigned long long curr)
404{
405 struct cpu_timer_list *timer, *next;
406
407 list_for_each_entry_safe(timer, next, head, entry)
408 list_del_init(&timer->entry);
409}
410
449/* 411/*
450 * Clean out CPU timers still ticking when a thread exited. The task 412 * Clean out CPU timers still ticking when a thread exited. The task
451 * pointer is cleared, and the expiry time is replaced with the residual 413 * pointer is cleared, and the expiry time is replaced with the residual
@@ -456,37 +418,12 @@ static void cleanup_timers(struct list_head *head,
456 cputime_t utime, cputime_t stime, 418 cputime_t utime, cputime_t stime,
457 unsigned long long sum_exec_runtime) 419 unsigned long long sum_exec_runtime)
458{ 420{
459 struct cpu_timer_list *timer, *next;
460 cputime_t ptime = utime + stime;
461
462 list_for_each_entry_safe(timer, next, head, entry) {
463 list_del_init(&timer->entry);
464 if (timer->expires.cpu < ptime) {
465 timer->expires.cpu = 0;
466 } else {
467 timer->expires.cpu -= ptime;
468 }
469 }
470 421
471 ++head; 422 cputime_t ptime = utime + stime;
472 list_for_each_entry_safe(timer, next, head, entry) {
473 list_del_init(&timer->entry);
474 if (timer->expires.cpu < utime) {
475 timer->expires.cpu = 0;
476 } else {
477 timer->expires.cpu -= utime;
478 }
479 }
480 423
481 ++head; 424 cleanup_timers_list(head, cputime_to_expires(ptime));
482 list_for_each_entry_safe(timer, next, head, entry) { 425 cleanup_timers_list(++head, cputime_to_expires(utime));
483 list_del_init(&timer->entry); 426 cleanup_timers_list(++head, sum_exec_runtime);
484 if (timer->expires.sched < sum_exec_runtime) {
485 timer->expires.sched = 0;
486 } else {
487 timer->expires.sched -= sum_exec_runtime;
488 }
489 }
490} 427}
491 428
492/* 429/*
@@ -516,17 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
516 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 453 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
517} 454}
518 455
519static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 456static void clear_dead_task(struct k_itimer *itimer, unsigned long long now)
520{ 457{
458 struct cpu_timer_list *timer = &itimer->it.cpu;
459
521 /* 460 /*
522 * That's all for this thread or process. 461 * That's all for this thread or process.
523 * We leave our residual in expires to be reported. 462 * We leave our residual in expires to be reported.
524 */ 463 */
525 put_task_struct(timer->it.cpu.task); 464 put_task_struct(timer->task);
526 timer->it.cpu.task = NULL; 465 timer->task = NULL;
527 timer->it.cpu.expires = cpu_time_sub(timer->it_clock, 466 if (timer->expires < now) {
528 timer->it.cpu.expires, 467 timer->expires = 0;
529 now); 468 } else {
469 timer->expires -= now;
470 }
530} 471}
531 472
532static inline int expires_gt(cputime_t expires, cputime_t new_exp) 473static inline int expires_gt(cputime_t expires, cputime_t new_exp)
@@ -558,14 +499,14 @@ static void arm_timer(struct k_itimer *timer)
558 499
559 listpos = head; 500 listpos = head;
560 list_for_each_entry(next, head, entry) { 501 list_for_each_entry(next, head, entry) {
561 if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) 502 if (nt->expires < next->expires)
562 break; 503 break;
563 listpos = &next->entry; 504 listpos = &next->entry;
564 } 505 }
565 list_add(&nt->entry, listpos); 506 list_add(&nt->entry, listpos);
566 507
567 if (listpos == head) { 508 if (listpos == head) {
568 union cpu_time_count *exp = &nt->expires; 509 unsigned long long exp = nt->expires;
569 510
570 /* 511 /*
571 * We are the new earliest-expiring POSIX 1.b timer, hence 512 * We are the new earliest-expiring POSIX 1.b timer, hence
@@ -576,17 +517,17 @@ static void arm_timer(struct k_itimer *timer)
576 517
577 switch (CPUCLOCK_WHICH(timer->it_clock)) { 518 switch (CPUCLOCK_WHICH(timer->it_clock)) {
578 case CPUCLOCK_PROF: 519 case CPUCLOCK_PROF:
579 if (expires_gt(cputime_expires->prof_exp, exp->cpu)) 520 if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp)))
580 cputime_expires->prof_exp = exp->cpu; 521 cputime_expires->prof_exp = expires_to_cputime(exp);
581 break; 522 break;
582 case CPUCLOCK_VIRT: 523 case CPUCLOCK_VIRT:
583 if (expires_gt(cputime_expires->virt_exp, exp->cpu)) 524 if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp)))
584 cputime_expires->virt_exp = exp->cpu; 525 cputime_expires->virt_exp = expires_to_cputime(exp);
585 break; 526 break;
586 case CPUCLOCK_SCHED: 527 case CPUCLOCK_SCHED:
587 if (cputime_expires->sched_exp == 0 || 528 if (cputime_expires->sched_exp == 0 ||
588 cputime_expires->sched_exp > exp->sched) 529 cputime_expires->sched_exp > exp)
589 cputime_expires->sched_exp = exp->sched; 530 cputime_expires->sched_exp = exp;
590 break; 531 break;
591 } 532 }
592 } 533 }
@@ -601,20 +542,20 @@ static void cpu_timer_fire(struct k_itimer *timer)
601 /* 542 /*
602 * User don't want any signal. 543 * User don't want any signal.
603 */ 544 */
604 timer->it.cpu.expires.sched = 0; 545 timer->it.cpu.expires = 0;
605 } else if (unlikely(timer->sigq == NULL)) { 546 } else if (unlikely(timer->sigq == NULL)) {
606 /* 547 /*
607 * This a special case for clock_nanosleep, 548 * This a special case for clock_nanosleep,
608 * not a normal timer from sys_timer_create. 549 * not a normal timer from sys_timer_create.
609 */ 550 */
610 wake_up_process(timer->it_process); 551 wake_up_process(timer->it_process);
611 timer->it.cpu.expires.sched = 0; 552 timer->it.cpu.expires = 0;
612 } else if (timer->it.cpu.incr.sched == 0) { 553 } else if (timer->it.cpu.incr == 0) {
613 /* 554 /*
614 * One-shot timer. Clear it as soon as it's fired. 555 * One-shot timer. Clear it as soon as it's fired.
615 */ 556 */
616 posix_timer_event(timer, 0); 557 posix_timer_event(timer, 0);
617 timer->it.cpu.expires.sched = 0; 558 timer->it.cpu.expires = 0;
618 } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { 559 } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
619 /* 560 /*
620 * The signal did not get queued because the signal 561 * The signal did not get queued because the signal
@@ -632,7 +573,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
632 */ 573 */
633static int cpu_timer_sample_group(const clockid_t which_clock, 574static int cpu_timer_sample_group(const clockid_t which_clock,
634 struct task_struct *p, 575 struct task_struct *p,
635 union cpu_time_count *cpu) 576 unsigned long long *sample)
636{ 577{
637 struct task_cputime cputime; 578 struct task_cputime cputime;
638 579
@@ -641,13 +582,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
641 default: 582 default:
642 return -EINVAL; 583 return -EINVAL;
643 case CPUCLOCK_PROF: 584 case CPUCLOCK_PROF:
644 cpu->cpu = cputime.utime + cputime.stime; 585 *sample = cputime_to_expires(cputime.utime + cputime.stime);
645 break; 586 break;
646 case CPUCLOCK_VIRT: 587 case CPUCLOCK_VIRT:
647 cpu->cpu = cputime.utime; 588 *sample = cputime_to_expires(cputime.utime);
648 break; 589 break;
649 case CPUCLOCK_SCHED: 590 case CPUCLOCK_SCHED:
650 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); 591 *sample = cputime.sum_exec_runtime + task_delta_exec(p);
651 break; 592 break;
652 } 593 }
653 return 0; 594 return 0;
@@ -694,7 +635,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
694 struct itimerspec *new, struct itimerspec *old) 635 struct itimerspec *new, struct itimerspec *old)
695{ 636{
696 struct task_struct *p = timer->it.cpu.task; 637 struct task_struct *p = timer->it.cpu.task;
697 union cpu_time_count old_expires, new_expires, old_incr, val; 638 unsigned long long old_expires, new_expires, old_incr, val;
698 int ret; 639 int ret;
699 640
700 if (unlikely(p == NULL)) { 641 if (unlikely(p == NULL)) {
@@ -749,7 +690,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
749 } 690 }
750 691
751 if (old) { 692 if (old) {
752 if (old_expires.sched == 0) { 693 if (old_expires == 0) {
753 old->it_value.tv_sec = 0; 694 old->it_value.tv_sec = 0;
754 old->it_value.tv_nsec = 0; 695 old->it_value.tv_nsec = 0;
755 } else { 696 } else {
@@ -764,11 +705,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
764 * new setting. 705 * new setting.
765 */ 706 */
766 bump_cpu_timer(timer, val); 707 bump_cpu_timer(timer, val);
767 if (cpu_time_before(timer->it_clock, val, 708 if (val < timer->it.cpu.expires) {
768 timer->it.cpu.expires)) { 709 old_expires = timer->it.cpu.expires - val;
769 old_expires = cpu_time_sub(
770 timer->it_clock,
771 timer->it.cpu.expires, val);
772 sample_to_timespec(timer->it_clock, 710 sample_to_timespec(timer->it_clock,
773 old_expires, 711 old_expires,
774 &old->it_value); 712 &old->it_value);
@@ -791,8 +729,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
791 goto out; 729 goto out;
792 } 730 }
793 731
794 if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { 732 if (new_expires != 0 && !(flags & TIMER_ABSTIME)) {
795 cpu_time_add(timer->it_clock, &new_expires, val); 733 new_expires += val;
796 } 734 }
797 735
798 /* 736 /*
@@ -801,8 +739,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
801 * arm the timer (we'll just fake it for timer_gettime). 739 * arm the timer (we'll just fake it for timer_gettime).
802 */ 740 */
803 timer->it.cpu.expires = new_expires; 741 timer->it.cpu.expires = new_expires;
804 if (new_expires.sched != 0 && 742 if (new_expires != 0 && val < new_expires) {
805 cpu_time_before(timer->it_clock, val, new_expires)) {
806 arm_timer(timer); 743 arm_timer(timer);
807 } 744 }
808 745
@@ -826,8 +763,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
826 timer->it_overrun_last = 0; 763 timer->it_overrun_last = 0;
827 timer->it_overrun = -1; 764 timer->it_overrun = -1;
828 765
829 if (new_expires.sched != 0 && 766 if (new_expires != 0 && !(val < new_expires)) {
830 !cpu_time_before(timer->it_clock, val, new_expires)) {
831 /* 767 /*
832 * The designated time already passed, so we notify 768 * The designated time already passed, so we notify
833 * immediately, even if the thread never runs to 769 * immediately, even if the thread never runs to
@@ -849,7 +785,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
849 785
850static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 786static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
851{ 787{
852 union cpu_time_count now; 788 unsigned long long now;
853 struct task_struct *p = timer->it.cpu.task; 789 struct task_struct *p = timer->it.cpu.task;
854 int clear_dead; 790 int clear_dead;
855 791
@@ -859,7 +795,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
859 sample_to_timespec(timer->it_clock, 795 sample_to_timespec(timer->it_clock,
860 timer->it.cpu.incr, &itp->it_interval); 796 timer->it.cpu.incr, &itp->it_interval);
861 797
862 if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ 798 if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
863 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; 799 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
864 return; 800 return;
865 } 801 }
@@ -891,7 +827,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
891 */ 827 */
892 put_task_struct(p); 828 put_task_struct(p);
893 timer->it.cpu.task = NULL; 829 timer->it.cpu.task = NULL;
894 timer->it.cpu.expires.sched = 0; 830 timer->it.cpu.expires = 0;
895 read_unlock(&tasklist_lock); 831 read_unlock(&tasklist_lock);
896 goto dead; 832 goto dead;
897 } else { 833 } else {
@@ -912,10 +848,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
912 goto dead; 848 goto dead;
913 } 849 }
914 850
915 if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { 851 if (now < timer->it.cpu.expires) {
916 sample_to_timespec(timer->it_clock, 852 sample_to_timespec(timer->it_clock,
917 cpu_time_sub(timer->it_clock, 853 timer->it.cpu.expires - now,
918 timer->it.cpu.expires, now),
919 &itp->it_value); 854 &itp->it_value);
920 } else { 855 } else {
921 /* 856 /*
@@ -927,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
927 } 862 }
928} 863}
929 864
865static unsigned long long
866check_timers_list(struct list_head *timers,
867 struct list_head *firing,
868 unsigned long long curr)
869{
870 int maxfire = 20;
871
872 while (!list_empty(timers)) {
873 struct cpu_timer_list *t;
874
875 t = list_first_entry(timers, struct cpu_timer_list, entry);
876
877 if (!--maxfire || curr < t->expires)
878 return t->expires;
879
880 t->firing = 1;
881 list_move_tail(&t->entry, firing);
882 }
883
884 return 0;
885}
886
930/* 887/*
931 * Check for any per-thread CPU timers that have fired and move them off 888 * Check for any per-thread CPU timers that have fired and move them off
932 * the tsk->cpu_timers[N] list onto the firing list. Here we update the 889 * the tsk->cpu_timers[N] list onto the firing list. Here we update the
@@ -935,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
935static void check_thread_timers(struct task_struct *tsk, 892static void check_thread_timers(struct task_struct *tsk,
936 struct list_head *firing) 893 struct list_head *firing)
937{ 894{
938 int maxfire;
939 struct list_head *timers = tsk->cpu_timers; 895 struct list_head *timers = tsk->cpu_timers;
940 struct signal_struct *const sig = tsk->signal; 896 struct signal_struct *const sig = tsk->signal;
897 struct task_cputime *tsk_expires = &tsk->cputime_expires;
898 unsigned long long expires;
941 unsigned long soft; 899 unsigned long soft;
942 900
943 maxfire = 20; 901 expires = check_timers_list(timers, firing, prof_ticks(tsk));
944 tsk->cputime_expires.prof_exp = 0; 902 tsk_expires->prof_exp = expires_to_cputime(expires);
945 while (!list_empty(timers)) {
946 struct cpu_timer_list *t = list_first_entry(timers,
947 struct cpu_timer_list,
948 entry);
949 if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
950 tsk->cputime_expires.prof_exp = t->expires.cpu;
951 break;
952 }
953 t->firing = 1;
954 list_move_tail(&t->entry, firing);
955 }
956 903
957 ++timers; 904 expires = check_timers_list(++timers, firing, virt_ticks(tsk));
958 maxfire = 20; 905 tsk_expires->virt_exp = expires_to_cputime(expires);
959 tsk->cputime_expires.virt_exp = 0;
960 while (!list_empty(timers)) {
961 struct cpu_timer_list *t = list_first_entry(timers,
962 struct cpu_timer_list,
963 entry);
964 if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
965 tsk->cputime_expires.virt_exp = t->expires.cpu;
966 break;
967 }
968 t->firing = 1;
969 list_move_tail(&t->entry, firing);
970 }
971 906
972 ++timers; 907 tsk_expires->sched_exp = check_timers_list(++timers, firing,
973 maxfire = 20; 908 tsk->se.sum_exec_runtime);
974 tsk->cputime_expires.sched_exp = 0;
975 while (!list_empty(timers)) {
976 struct cpu_timer_list *t = list_first_entry(timers,
977 struct cpu_timer_list,
978 entry);
979 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
980 tsk->cputime_expires.sched_exp = t->expires.sched;
981 break;
982 }
983 t->firing = 1;
984 list_move_tail(&t->entry, firing);
985 }
986 909
987 /* 910 /*
988 * Check for the special case thread timers. 911 * Check for the special case thread timers.
@@ -1030,7 +953,8 @@ static void stop_process_timers(struct signal_struct *sig)
1030static u32 onecputick; 953static u32 onecputick;
1031 954
1032static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, 955static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1033 cputime_t *expires, cputime_t cur_time, int signo) 956 unsigned long long *expires,
957 unsigned long long cur_time, int signo)
1034{ 958{
1035 if (!it->expires) 959 if (!it->expires)
1036 return; 960 return;
@@ -1066,9 +990,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1066static void check_process_timers(struct task_struct *tsk, 990static void check_process_timers(struct task_struct *tsk,
1067 struct list_head *firing) 991 struct list_head *firing)
1068{ 992{
1069 int maxfire;
1070 struct signal_struct *const sig = tsk->signal; 993 struct signal_struct *const sig = tsk->signal;
1071 cputime_t utime, ptime, virt_expires, prof_expires; 994 unsigned long long utime, ptime, virt_expires, prof_expires;
1072 unsigned long long sum_sched_runtime, sched_expires; 995 unsigned long long sum_sched_runtime, sched_expires;
1073 struct list_head *timers = sig->cpu_timers; 996 struct list_head *timers = sig->cpu_timers;
1074 struct task_cputime cputime; 997 struct task_cputime cputime;
@@ -1078,52 +1001,13 @@ static void check_process_timers(struct task_struct *tsk,
1078 * Collect the current process totals. 1001 * Collect the current process totals.
1079 */ 1002 */
1080 thread_group_cputimer(tsk, &cputime); 1003 thread_group_cputimer(tsk, &cputime);
1081 utime = cputime.utime; 1004 utime = cputime_to_expires(cputime.utime);
1082 ptime = utime + cputime.stime; 1005 ptime = utime + cputime_to_expires(cputime.stime);
1083 sum_sched_runtime = cputime.sum_exec_runtime; 1006 sum_sched_runtime = cputime.sum_exec_runtime;
1084 maxfire = 20;
1085 prof_expires = 0;
1086 while (!list_empty(timers)) {
1087 struct cpu_timer_list *tl = list_first_entry(timers,
1088 struct cpu_timer_list,
1089 entry);
1090 if (!--maxfire || ptime < tl->expires.cpu) {
1091 prof_expires = tl->expires.cpu;
1092 break;
1093 }
1094 tl->firing = 1;
1095 list_move_tail(&tl->entry, firing);
1096 }
1097 1007
1098 ++timers; 1008 prof_expires = check_timers_list(timers, firing, ptime);
1099 maxfire = 20; 1009 virt_expires = check_timers_list(++timers, firing, utime);
1100 virt_expires = 0; 1010 sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);
1101 while (!list_empty(timers)) {
1102 struct cpu_timer_list *tl = list_first_entry(timers,
1103 struct cpu_timer_list,
1104 entry);
1105 if (!--maxfire || utime < tl->expires.cpu) {
1106 virt_expires = tl->expires.cpu;
1107 break;
1108 }
1109 tl->firing = 1;
1110 list_move_tail(&tl->entry, firing);
1111 }
1112
1113 ++timers;
1114 maxfire = 20;
1115 sched_expires = 0;
1116 while (!list_empty(timers)) {
1117 struct cpu_timer_list *tl = list_first_entry(timers,
1118 struct cpu_timer_list,
1119 entry);
1120 if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
1121 sched_expires = tl->expires.sched;
1122 break;
1123 }
1124 tl->firing = 1;
1125 list_move_tail(&tl->entry, firing);
1126 }
1127 1011
1128 /* 1012 /*
1129 * Check for the special case process timers. 1013 * Check for the special case process timers.
@@ -1162,8 +1046,8 @@ static void check_process_timers(struct task_struct *tsk,
1162 } 1046 }
1163 } 1047 }
1164 1048
1165 sig->cputime_expires.prof_exp = prof_expires; 1049 sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
1166 sig->cputime_expires.virt_exp = virt_expires; 1050 sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
1167 sig->cputime_expires.sched_exp = sched_expires; 1051 sig->cputime_expires.sched_exp = sched_expires;
1168 if (task_cputime_zero(&sig->cputime_expires)) 1052 if (task_cputime_zero(&sig->cputime_expires))
1169 stop_process_timers(sig); 1053 stop_process_timers(sig);
@@ -1176,7 +1060,7 @@ static void check_process_timers(struct task_struct *tsk,
1176void posix_cpu_timer_schedule(struct k_itimer *timer) 1060void posix_cpu_timer_schedule(struct k_itimer *timer)
1177{ 1061{
1178 struct task_struct *p = timer->it.cpu.task; 1062 struct task_struct *p = timer->it.cpu.task;
1179 union cpu_time_count now; 1063 unsigned long long now;
1180 1064
1181 if (unlikely(p == NULL)) 1065 if (unlikely(p == NULL))
1182 /* 1066 /*
@@ -1205,7 +1089,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1205 */ 1089 */
1206 put_task_struct(p); 1090 put_task_struct(p);
1207 timer->it.cpu.task = p = NULL; 1091 timer->it.cpu.task = p = NULL;
1208 timer->it.cpu.expires.sched = 0; 1092 timer->it.cpu.expires = 0;
1209 goto out_unlock; 1093 goto out_unlock;
1210 } else if (unlikely(p->exit_state) && thread_group_empty(p)) { 1094 } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1211 /* 1095 /*
@@ -1213,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1213 * not yet reaped. Take this opportunity to 1097 * not yet reaped. Take this opportunity to
1214 * drop our task ref. 1098 * drop our task ref.
1215 */ 1099 */
1100 cpu_timer_sample_group(timer->it_clock, p, &now);
1216 clear_dead_task(timer, now); 1101 clear_dead_task(timer, now);
1217 goto out_unlock; 1102 goto out_unlock;
1218 } 1103 }
@@ -1387,7 +1272,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1387void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1272void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1388 cputime_t *newval, cputime_t *oldval) 1273 cputime_t *newval, cputime_t *oldval)
1389{ 1274{
1390 union cpu_time_count now; 1275 unsigned long long now;
1391 1276
1392 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1277 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1393 cpu_timer_sample_group(clock_idx, tsk, &now); 1278 cpu_timer_sample_group(clock_idx, tsk, &now);
@@ -1399,17 +1284,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1399 * it to be absolute. 1284 * it to be absolute.
1400 */ 1285 */
1401 if (*oldval) { 1286 if (*oldval) {
1402 if (*oldval <= now.cpu) { 1287 if (*oldval <= now) {
1403 /* Just about to fire. */ 1288 /* Just about to fire. */
1404 *oldval = cputime_one_jiffy; 1289 *oldval = cputime_one_jiffy;
1405 } else { 1290 } else {
1406 *oldval -= now.cpu; 1291 *oldval -= now;
1407 } 1292 }
1408 } 1293 }
1409 1294
1410 if (!*newval) 1295 if (!*newval)
1411 goto out; 1296 goto out;
1412 *newval += now.cpu; 1297 *newval += now;
1413 } 1298 }
1414 1299
1415 /* 1300 /*
@@ -1459,7 +1344,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1459 } 1344 }
1460 1345
1461 while (!signal_pending(current)) { 1346 while (!signal_pending(current)) {
1462 if (timer.it.cpu.expires.sched == 0) { 1347 if (timer.it.cpu.expires == 0) {
1463 /* 1348 /*
1464 * Our timer fired and was reset, below 1349 * Our timer fired and was reset, below
1465 * deletion can not fail. 1350 * deletion can not fail.
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5dfdc9ea180b..d444c4e834f4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -100,7 +100,6 @@ config PM_SLEEP_SMP
100 depends on SMP 100 depends on SMP
101 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE 101 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
102 depends on PM_SLEEP 102 depends on PM_SLEEP
103 select HOTPLUG
104 select HOTPLUG_CPU 103 select HOTPLUG_CPU
105 104
106config PM_AUTOSLEEP 105config PM_AUTOSLEEP
@@ -263,6 +262,26 @@ config PM_GENERIC_DOMAINS
263 bool 262 bool
264 depends on PM 263 depends on PM
265 264
265config WQ_POWER_EFFICIENT_DEFAULT
266 bool "Enable workqueue power-efficient mode by default"
267 depends on PM
268 default n
269 help
270 Per-cpu workqueues are generally preferred because they show
271 better performance thanks to cache locality; unfortunately,
272 per-cpu workqueues tend to be more power hungry than unbound
273 workqueues.
274
275 Enabling the workqueue.power_efficient kernel parameter makes the
276 per-cpu workqueues which were observed to contribute
277 significantly to power consumption unbound, leading to measurably
278 lower power usage at the cost of small performance overhead.
279
280 This config option determines whether workqueue.power_efficient
281 is enabled by default.
282
283 If in doubt, say N.
284
266config PM_GENERIC_DOMAINS_SLEEP 285config PM_GENERIC_DOMAINS_SLEEP
267 def_bool y 286 def_bool y
268 depends on PM_SLEEP && PM_GENERIC_DOMAINS 287 depends on PM_SLEEP && PM_GENERIC_DOMAINS
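
For context on the new option: WQ_POWER_EFFICIENT_DEFAULT only flips the default of the existing workqueue.power_efficient boot parameter, it does not add a new mechanism. A rough userspace analogue of that "compile-time default, runtime override" wiring is sketched below; POWER_EFFICIENT_DEFAULT and the power_efficient= argument are names invented for the sketch, not the kernel's actual symbols.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#ifdef POWER_EFFICIENT_DEFAULT
static bool power_efficient = true;     /* analogous to "default y" */
#else
static bool power_efficient = false;    /* analogous to "default n" */
#endif

int main(int argc, char **argv)
{
        /* analogous to workqueue.power_efficient=... on the kernel command line */
        for (int i = 1; i < argc; i++) {
                if (!strcmp(argv[i], "power_efficient=1"))
                        power_efficient = true;
                else if (!strcmp(argv[i], "power_efficient=0"))
                        power_efficient = false;
        }

        printf("power-efficient workqueues: %s\n",
               power_efficient ? "enabled" : "disabled");
        return 0;
}
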
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index c6422ffeda9a..9012ecf7b814 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -32,7 +32,8 @@ static void try_to_suspend(struct work_struct *work)
32 32
33 mutex_lock(&autosleep_lock); 33 mutex_lock(&autosleep_lock);
34 34
35 if (!pm_save_wakeup_count(initial_count)) { 35 if (!pm_save_wakeup_count(initial_count) ||
36 system_state != SYSTEM_RUNNING) {
36 mutex_unlock(&autosleep_lock); 37 mutex_unlock(&autosleep_lock);
37 goto out; 38 goto out;
38 } 39 }
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d77663bfedeb..1d1bf630e6e9 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -424,6 +424,8 @@ static ssize_t wakeup_count_store(struct kobject *kobj,
424 if (sscanf(buf, "%u", &val) == 1) { 424 if (sscanf(buf, "%u", &val) == 1) {
425 if (pm_save_wakeup_count(val)) 425 if (pm_save_wakeup_count(val))
426 error = n; 426 error = n;
427 else
428 pm_print_active_wakeup_sources();
427 } 429 }
428 430
429 out: 431 out:
@@ -528,6 +530,10 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
528 530
529 if (sscanf(buf, "%d", &val) == 1) { 531 if (sscanf(buf, "%d", &val) == 1) {
530 pm_trace_enabled = !!val; 532 pm_trace_enabled = !!val;
533 if (pm_trace_enabled) {
534 pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n"
535 "PM: Correct system time has to be restored manually after resume.\n");
536 }
531 return n; 537 return n;
532 } 538 }
533 return -EINVAL; 539 return -EINVAL;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 98088e0e71e8..06ec8869dbf1 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -30,9 +30,10 @@ static int try_to_freeze_tasks(bool user_only)
30 unsigned int todo; 30 unsigned int todo;
31 bool wq_busy = false; 31 bool wq_busy = false;
32 struct timeval start, end; 32 struct timeval start, end;
33 u64 elapsed_csecs64; 33 u64 elapsed_msecs64;
34 unsigned int elapsed_csecs; 34 unsigned int elapsed_msecs;
35 bool wakeup = false; 35 bool wakeup = false;
36 int sleep_usecs = USEC_PER_MSEC;
36 37
37 do_gettimeofday(&start); 38 do_gettimeofday(&start);
38 39
@@ -68,22 +69,25 @@ static int try_to_freeze_tasks(bool user_only)
68 69
69 /* 70 /*
70 * We need to retry, but first give the freezing tasks some 71 * We need to retry, but first give the freezing tasks some
71 * time to enter the refrigerator. 72 * time to enter the refrigerator. Start with an initial
73 * 1 ms sleep followed by exponential backoff until 8 ms.
72 */ 74 */
73 msleep(10); 75 usleep_range(sleep_usecs / 2, sleep_usecs);
76 if (sleep_usecs < 8 * USEC_PER_MSEC)
77 sleep_usecs *= 2;
74 } 78 }
75 79
76 do_gettimeofday(&end); 80 do_gettimeofday(&end);
77 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); 81 elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
78 do_div(elapsed_csecs64, NSEC_PER_SEC / 100); 82 do_div(elapsed_msecs64, NSEC_PER_MSEC);
79 elapsed_csecs = elapsed_csecs64; 83 elapsed_msecs = elapsed_msecs64;
80 84
81 if (todo) { 85 if (todo) {
82 printk("\n"); 86 printk("\n");
83 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " 87 printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds "
84 "(%d tasks refusing to freeze, wq_busy=%d):\n", 88 "(%d tasks refusing to freeze, wq_busy=%d):\n",
85 wakeup ? "aborted" : "failed", 89 wakeup ? "aborted" : "failed",
86 elapsed_csecs / 100, elapsed_csecs % 100, 90 elapsed_msecs / 1000, elapsed_msecs % 1000,
87 todo - wq_busy, wq_busy); 91 todo - wq_busy, wq_busy);
88 92
89 if (!wakeup) { 93 if (!wakeup) {
@@ -96,8 +100,8 @@ static int try_to_freeze_tasks(bool user_only)
96 read_unlock(&tasklist_lock); 100 read_unlock(&tasklist_lock);
97 } 101 }
98 } else { 102 } else {
99 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, 103 printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
100 elapsed_csecs % 100); 104 elapsed_msecs % 1000);
101 } 105 }
102 106
103 return todo ? -EBUSY : 0; 107 return todo ? -EBUSY : 0;
@@ -105,6 +109,8 @@ static int try_to_freeze_tasks(bool user_only)
105 109
106/** 110/**
107 * freeze_processes - Signal user space processes to enter the refrigerator. 111 * freeze_processes - Signal user space processes to enter the refrigerator.
112 * The current thread will not be frozen. The same process that calls
113 * freeze_processes must later call thaw_processes.
108 * 114 *
109 * On success, returns 0. On failure, -errno and system is fully thawed. 115 * On success, returns 0. On failure, -errno and system is fully thawed.
110 */ 116 */
@@ -116,6 +122,9 @@ int freeze_processes(void)
116 if (error) 122 if (error)
117 return error; 123 return error;
118 124
125 /* Make sure this task doesn't get frozen */
126 current->flags |= PF_SUSPEND_TASK;
127
119 if (!pm_freezing) 128 if (!pm_freezing)
120 atomic_inc(&system_freezing_cnt); 129 atomic_inc(&system_freezing_cnt);
121 130
@@ -164,6 +173,7 @@ int freeze_kernel_threads(void)
164void thaw_processes(void) 173void thaw_processes(void)
165{ 174{
166 struct task_struct *g, *p; 175 struct task_struct *g, *p;
176 struct task_struct *curr = current;
167 177
168 if (pm_freezing) 178 if (pm_freezing)
169 atomic_dec(&system_freezing_cnt); 179 atomic_dec(&system_freezing_cnt);
@@ -178,10 +188,15 @@ void thaw_processes(void)
178 188
179 read_lock(&tasklist_lock); 189 read_lock(&tasklist_lock);
180 do_each_thread(g, p) { 190 do_each_thread(g, p) {
191 /* No other threads should have PF_SUSPEND_TASK set */
192 WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));
181 __thaw_task(p); 193 __thaw_task(p);
182 } while_each_thread(g, p); 194 } while_each_thread(g, p);
183 read_unlock(&tasklist_lock); 195 read_unlock(&tasklist_lock);
184 196
197 WARN_ON(!(curr->flags & PF_SUSPEND_TASK));
198 curr->flags &= ~PF_SUSPEND_TASK;
199
185 usermodehelper_enable(); 200 usermodehelper_enable();
186 201
187 schedule(); 202 schedule();
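
The try_to_freeze_tasks() hunks above replace the fixed msleep(10) between retries with usleep_range() plus exponential backoff (roughly 1 ms, doubling up to an 8 ms ceiling) and switch the elapsed-time reporting from centiseconds to milliseconds. A small userspace sketch of the same backoff shape, assuming a fake todo() predicate in place of the count of not-yet-frozen tasks and plain usleep() since usleep_range() is kernel-only:

#include <stdio.h>
#include <unistd.h>

#define USEC_PER_MSEC 1000

static int todo(void)
{
        static int remaining = 5;       /* pretend 5 tasks still need to freeze */
        return remaining-- > 0;
}

int main(void)
{
        int sleep_usecs = USEC_PER_MSEC;

        while (todo()) {
                /* give the stragglers some time; back off 1 ms -> 2 -> 4 -> 8 ms */
                printf("not done yet, sleeping %d us\n", sleep_usecs);
                usleep(sleep_usecs);
                if (sleep_usecs < 8 * USEC_PER_MSEC)
                        sleep_usecs *= 2;
        }
        printf("all tasks frozen\n");
        return 0;
}
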
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 587dddeebf15..06fe28589e9c 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -44,6 +44,7 @@
44 44
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/export.h> 46#include <linux/export.h>
47#include <trace/events/power.h>
47 48
48/* 49/*
49 * locking rule: all changes to constraints or notifiers lists 50 * locking rule: all changes to constraints or notifiers lists
@@ -202,6 +203,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
202 203
203 spin_unlock_irqrestore(&pm_qos_lock, flags); 204 spin_unlock_irqrestore(&pm_qos_lock, flags);
204 205
206 trace_pm_qos_update_target(action, prev_value, curr_value);
205 if (prev_value != curr_value) { 207 if (prev_value != curr_value) {
206 blocking_notifier_call_chain(c->notifiers, 208 blocking_notifier_call_chain(c->notifiers,
207 (unsigned long)curr_value, 209 (unsigned long)curr_value,
@@ -272,6 +274,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,
272 274
273 spin_unlock_irqrestore(&pm_qos_lock, irqflags); 275 spin_unlock_irqrestore(&pm_qos_lock, irqflags);
274 276
277 trace_pm_qos_update_flags(action, prev_value, curr_value);
275 return prev_value != curr_value; 278 return prev_value != curr_value;
276} 279}
277 280
@@ -333,6 +336,7 @@ void pm_qos_add_request(struct pm_qos_request *req,
333 } 336 }
334 req->pm_qos_class = pm_qos_class; 337 req->pm_qos_class = pm_qos_class;
335 INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); 338 INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
339 trace_pm_qos_add_request(pm_qos_class, value);
336 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, 340 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
337 &req->node, PM_QOS_ADD_REQ, value); 341 &req->node, PM_QOS_ADD_REQ, value);
338} 342}
@@ -361,6 +365,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
361 365
362 cancel_delayed_work_sync(&req->work); 366 cancel_delayed_work_sync(&req->work);
363 367
368 trace_pm_qos_update_request(req->pm_qos_class, new_value);
364 if (new_value != req->node.prio) 369 if (new_value != req->node.prio)
365 pm_qos_update_target( 370 pm_qos_update_target(
366 pm_qos_array[req->pm_qos_class]->constraints, 371 pm_qos_array[req->pm_qos_class]->constraints,
@@ -387,6 +392,8 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
387 392
388 cancel_delayed_work_sync(&req->work); 393 cancel_delayed_work_sync(&req->work);
389 394
395 trace_pm_qos_update_request_timeout(req->pm_qos_class,
396 new_value, timeout_us);
390 if (new_value != req->node.prio) 397 if (new_value != req->node.prio)
391 pm_qos_update_target( 398 pm_qos_update_target(
392 pm_qos_array[req->pm_qos_class]->constraints, 399 pm_qos_array[req->pm_qos_class]->constraints,
@@ -416,6 +423,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)
416 423
417 cancel_delayed_work_sync(&req->work); 424 cancel_delayed_work_sync(&req->work);
418 425
426 trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE);
419 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, 427 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
420 &req->node, PM_QOS_REMOVE_REQ, 428 &req->node, PM_QOS_REMOVE_REQ,
421 PM_QOS_DEFAULT_VALUE); 429 PM_QOS_DEFAULT_VALUE);
@@ -477,7 +485,7 @@ static int find_pm_qos_object_by_minor(int minor)
477{ 485{
478 int pm_qos_class; 486 int pm_qos_class;
479 487
480 for (pm_qos_class = 0; 488 for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY;
481 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { 489 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
482 if (minor == 490 if (minor ==
483 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) 491 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
@@ -491,7 +499,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
491 long pm_qos_class; 499 long pm_qos_class;
492 500
493 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 501 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
494 if (pm_qos_class >= 0) { 502 if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) {
495 struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); 503 struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
496 if (!req) 504 if (!req)
497 return -ENOMEM; 505 return -ENOMEM;
@@ -584,7 +592,7 @@ static int __init pm_qos_power_init(void)
584 592
585 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); 593 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
586 594
587 for (i = 1; i < PM_QOS_NUM_CLASSES; i++) { 595 for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
588 ret = register_pm_qos_misc(pm_qos_array[i]); 596 ret = register_pm_qos_misc(pm_qos_array[i]);
589 if (ret < 0) { 597 if (ret < 0) {
590 printk(KERN_ERR "pm_qos_param: %s setup failed\n", 598 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0de28576807d..349587bb03e1 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -642,8 +642,9 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
642 region->end_pfn = end_pfn; 642 region->end_pfn = end_pfn;
643 list_add_tail(&region->list, &nosave_regions); 643 list_add_tail(&region->list, &nosave_regions);
644 Report: 644 Report:
645 printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n", 645 printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n",
646 start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); 646 (unsigned long long) start_pfn << PAGE_SHIFT,
647 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
647} 648}
648 649
649/* 650/*
@@ -1651,7 +1652,7 @@ unsigned long snapshot_get_image_size(void)
1651static int init_header(struct swsusp_info *info) 1652static int init_header(struct swsusp_info *info)
1652{ 1653{
1653 memset(info, 0, sizeof(struct swsusp_info)); 1654 memset(info, 0, sizeof(struct swsusp_info));
1654 info->num_physpages = num_physpages; 1655 info->num_physpages = get_num_physpages();
1655 info->image_pages = nr_copy_pages; 1656 info->image_pages = nr_copy_pages;
1656 info->pages = snapshot_get_image_size(); 1657 info->pages = snapshot_get_image_size();
1657 info->size = info->pages; 1658 info->size = info->pages;
@@ -1795,7 +1796,7 @@ static int check_header(struct swsusp_info *info)
1795 char *reason; 1796 char *reason;
1796 1797
1797 reason = check_image_kernel(info); 1798 reason = check_image_kernel(info);
1798 if (!reason && info->num_physpages != num_physpages) 1799 if (!reason && info->num_physpages != get_num_physpages())
1799 reason = "memory size"; 1800 reason = "memory size";
1800 if (reason) { 1801 if (reason) {
1801 printk(KERN_ERR "PM: Image mismatch: %s\n", reason); 1802 printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index bef86d121eb2..ece04223bb1e 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -269,7 +269,7 @@ int suspend_devices_and_enter(suspend_state_t state)
269 suspend_test_start(); 269 suspend_test_start();
270 error = dpm_suspend_start(PMSG_SUSPEND); 270 error = dpm_suspend_start(PMSG_SUSPEND);
271 if (error) { 271 if (error) {
272 printk(KERN_ERR "PM: Some devices failed to suspend\n"); 272 pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
273 goto Recover_platform; 273 goto Recover_platform;
274 } 274 }
275 suspend_test_finish("suspend devices"); 275 suspend_test_finish("suspend devices");
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
new file mode 100644
index 000000000000..85405bdcf2b3
--- /dev/null
+++ b/kernel/printk/Makefile
@@ -0,0 +1,2 @@
1obj-y = printk.o
2obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
new file mode 100644
index 000000000000..276762f3a460
--- /dev/null
+++ b/kernel/printk/braille.c
@@ -0,0 +1,49 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
3#include <linux/kernel.h>
4#include <linux/console.h>
5#include <linux/string.h>
6
7#include "console_cmdline.h"
8#include "braille.h"
9
10char *_braille_console_setup(char **str, char **brl_options)
11{
12 if (!memcmp(*str, "brl,", 4)) {
13 *brl_options = "";
14 *str += 4;
15 } else if (!memcmp(*str, "brl=", 4)) {
16 *brl_options = *str + 4;
17 *str = strchr(*brl_options, ',');
18 if (!*str)
19 pr_err("need port name after brl=\n");
20 else
21 *((*str)++) = 0;
22 } else
23 return NULL;
24
25 return *str;
26}
27
28int
29_braille_register_console(struct console *console, struct console_cmdline *c)
30{
31 int rtn = 0;
32
33 if (c->brl_options) {
34 console->flags |= CON_BRL;
35 rtn = braille_register_console(console, c->index, c->options,
36 c->brl_options);
37 }
38
39 return rtn;
40}
41
42int
43_braille_unregister_console(struct console *console)
44{
45 if (console->flags & CON_BRL)
46 return braille_unregister_console(console);
47
48 return 0;
49}
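
Note that _braille_console_setup() receives a char **, so both prefix checks have to look at *str. The standalone harness below exercises the same parsing on the two accepted forms, "brl,<console options>" and "brl=<braille options>,<console options>"; parse_brl() is a re-typed userspace copy for illustration, not a kernel function.

#include <stdio.h>
#include <string.h>

static char *parse_brl(char **str, char **brl_options)
{
        if (!memcmp(*str, "brl,", 4)) {
                *brl_options = "";
                *str += 4;
        } else if (!memcmp(*str, "brl=", 4)) {
                *brl_options = *str + 4;
                *str = strchr(*brl_options, ',');
                if (!*str)
                        fprintf(stderr, "need port name after brl=\n");
                else
                        *((*str)++) = 0;        /* split options from the port name */
        } else {
                return NULL;
        }
        return *str;
}

int main(void)
{
        char a[] = "brl,ttyS0,115200";
        char b[] = "brl=usb,ttyS1";
        char *s, *opts;

        s = a; opts = NULL;
        parse_brl(&s, &opts);
        printf("rest=\"%s\" brl_options=\"%s\"\n", s, opts);    /* ttyS0,115200 and "" */

        s = b; opts = NULL;
        parse_brl(&s, &opts);
        printf("rest=\"%s\" brl_options=\"%s\"\n", s, opts);    /* ttyS1 and usb */
        return 0;
}
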
diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h
new file mode 100644
index 000000000000..769d771145c8
--- /dev/null
+++ b/kernel/printk/braille.h
@@ -0,0 +1,48 @@
1#ifndef _PRINTK_BRAILLE_H
2#define _PRINTK_BRAILLE_H
3
4#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
5
6static inline void
7braille_set_options(struct console_cmdline *c, char *brl_options)
8{
9 c->brl_options = brl_options;
10}
11
12char *
13_braille_console_setup(char **str, char **brl_options);
14
15int
16_braille_register_console(struct console *console, struct console_cmdline *c);
17
18int
19_braille_unregister_console(struct console *console);
20
21#else
22
23static inline void
24braille_set_options(struct console_cmdline *c, char *brl_options)
25{
26}
27
28static inline char *
29_braille_console_setup(char **str, char **brl_options)
30{
31 return NULL;
32}
33
34static inline int
35_braille_register_console(struct console *console, struct console_cmdline *c)
36{
37 return 0;
38}
39
40static inline int
41_braille_unregister_console(struct console *console)
42{
43 return 0;
44}
45
46#endif
47
48#endif
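
braille.h follows the usual kernel header pattern: real prototypes when CONFIG_A11Y_BRAILLE_CONSOLE is set, static inline no-op stubs otherwise, so printk.c can call the helpers unconditionally and the #ifdef lives in exactly one place. A toy single-file version of that pattern, with HAVE_FEATURE_X and feature_x_register() as made-up names:

#include <stdio.h>

/* --- what would live in the header --- */
#ifdef HAVE_FEATURE_X
/* "real" implementation; in the kernel this would live in its own .c file */
int feature_x_register(const char *name)
{
        printf("registering %s with feature X\n", name);
        return 1;
}
#else
static inline int feature_x_register(const char *name)
{
        (void)name;
        return 0;               /* harmless no-op when the feature is compiled out */
}
#endif

/* --- what would live in the caller --- */
int main(void)
{
        /* no #ifdef needed at the call site; it compiles either way */
        int ret = feature_x_register("console0");

        printf("feature_x_register() returned %d\n", ret);
        return 0;
}
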
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h
new file mode 100644
index 000000000000..cbd69d842341
--- /dev/null
+++ b/kernel/printk/console_cmdline.h
@@ -0,0 +1,14 @@
1#ifndef _CONSOLE_CMDLINE_H
2#define _CONSOLE_CMDLINE_H
3
4struct console_cmdline
5{
6 char name[8]; /* Name of the driver */
7 int index; /* Minor dev. to use */
8 char *options; /* Options for the driver */
9#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
10 char *brl_options; /* Options for braille driver */
11#endif
12};
13
14#endif
diff --git a/kernel/printk.c b/kernel/printk/printk.c
index 8212c1aef125..5b5a7080e2a5 100644
--- a/kernel/printk.c
+++ b/kernel/printk/printk.c
@@ -51,6 +51,9 @@
51#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
52#include <trace/events/printk.h> 52#include <trace/events/printk.h>
53 53
54#include "console_cmdline.h"
55#include "braille.h"
56
54/* printk's without a loglevel use this.. */ 57/* printk's without a loglevel use this.. */
55#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL 58#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
56 59
@@ -105,19 +108,11 @@ static struct console *exclusive_console;
105/* 108/*
106 * Array of consoles built from command line options (console=) 109 * Array of consoles built from command line options (console=)
107 */ 110 */
108struct console_cmdline
109{
110 char name[8]; /* Name of the driver */
111 int index; /* Minor dev. to use */
112 char *options; /* Options for the driver */
113#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
114 char *brl_options; /* Options for braille driver */
115#endif
116};
117 111
118#define MAX_CMDLINECONSOLES 8 112#define MAX_CMDLINECONSOLES 8
119 113
120static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; 114static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
115
121static int selected_console = -1; 116static int selected_console = -1;
122static int preferred_console = -1; 117static int preferred_console = -1;
123int console_set_on_cmdline; 118int console_set_on_cmdline;
@@ -178,7 +173,7 @@ static int console_may_schedule;
178 * 67 "g" 173 * 67 "g"
179 * 0032 00 00 00 padding to next message header 174 * 0032 00 00 00 padding to next message header
180 * 175 *
181 * The 'struct log' buffer header must never be directly exported to 176 * The 'struct printk_log' buffer header must never be directly exported to
182 * userspace, it is a kernel-private implementation detail that might 177 * userspace, it is a kernel-private implementation detail that might
183 * need to be changed in the future, when the requirements change. 178 * need to be changed in the future, when the requirements change.
184 * 179 *
@@ -200,7 +195,7 @@ enum log_flags {
200 LOG_CONT = 8, /* text is a fragment of a continuation line */ 195 LOG_CONT = 8, /* text is a fragment of a continuation line */
201}; 196};
202 197
203struct log { 198struct printk_log {
204 u64 ts_nsec; /* timestamp in nanoseconds */ 199 u64 ts_nsec; /* timestamp in nanoseconds */
205 u16 len; /* length of entire record */ 200 u16 len; /* length of entire record */
206 u16 text_len; /* length of text buffer */ 201 u16 text_len; /* length of text buffer */
@@ -248,7 +243,7 @@ static u32 clear_idx;
248#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 243#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
249#define LOG_ALIGN 4 244#define LOG_ALIGN 4
250#else 245#else
251#define LOG_ALIGN __alignof__(struct log) 246#define LOG_ALIGN __alignof__(struct printk_log)
252#endif 247#endif
253#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 248#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
254static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); 249static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
@@ -259,35 +254,35 @@ static u32 log_buf_len = __LOG_BUF_LEN;
259static volatile unsigned int logbuf_cpu = UINT_MAX; 254static volatile unsigned int logbuf_cpu = UINT_MAX;
260 255
261/* human readable text of the record */ 256/* human readable text of the record */
262static char *log_text(const struct log *msg) 257static char *log_text(const struct printk_log *msg)
263{ 258{
264 return (char *)msg + sizeof(struct log); 259 return (char *)msg + sizeof(struct printk_log);
265} 260}
266 261
267/* optional key/value pair dictionary attached to the record */ 262/* optional key/value pair dictionary attached to the record */
268static char *log_dict(const struct log *msg) 263static char *log_dict(const struct printk_log *msg)
269{ 264{
270 return (char *)msg + sizeof(struct log) + msg->text_len; 265 return (char *)msg + sizeof(struct printk_log) + msg->text_len;
271} 266}
272 267
273/* get record by index; idx must point to valid msg */ 268/* get record by index; idx must point to valid msg */
274static struct log *log_from_idx(u32 idx) 269static struct printk_log *log_from_idx(u32 idx)
275{ 270{
276 struct log *msg = (struct log *)(log_buf + idx); 271 struct printk_log *msg = (struct printk_log *)(log_buf + idx);
277 272
278 /* 273 /*
279 * A length == 0 record is the end of buffer marker. Wrap around and 274 * A length == 0 record is the end of buffer marker. Wrap around and
280 * read the message at the start of the buffer. 275 * read the message at the start of the buffer.
281 */ 276 */
282 if (!msg->len) 277 if (!msg->len)
283 return (struct log *)log_buf; 278 return (struct printk_log *)log_buf;
284 return msg; 279 return msg;
285} 280}
286 281
287/* get next record; idx must point to valid msg */ 282/* get next record; idx must point to valid msg */
288static u32 log_next(u32 idx) 283static u32 log_next(u32 idx)
289{ 284{
290 struct log *msg = (struct log *)(log_buf + idx); 285 struct printk_log *msg = (struct printk_log *)(log_buf + idx);
291 286
292 /* length == 0 indicates the end of the buffer; wrap */ 287 /* length == 0 indicates the end of the buffer; wrap */
293 /* 288 /*
@@ -296,7 +291,7 @@ static u32 log_next(u32 idx)
296 * return the one after that. 291 * return the one after that.
297 */ 292 */
298 if (!msg->len) { 293 if (!msg->len) {
299 msg = (struct log *)log_buf; 294 msg = (struct printk_log *)log_buf;
300 return msg->len; 295 return msg->len;
301 } 296 }
302 return idx + msg->len; 297 return idx + msg->len;
@@ -308,11 +303,11 @@ static void log_store(int facility, int level,
308 const char *dict, u16 dict_len, 303 const char *dict, u16 dict_len,
309 const char *text, u16 text_len) 304 const char *text, u16 text_len)
310{ 305{
311 struct log *msg; 306 struct printk_log *msg;
312 u32 size, pad_len; 307 u32 size, pad_len;
313 308
314 /* number of '\0' padding bytes to next message */ 309 /* number of '\0' padding bytes to next message */
315 size = sizeof(struct log) + text_len + dict_len; 310 size = sizeof(struct printk_log) + text_len + dict_len;
316 pad_len = (-size) & (LOG_ALIGN - 1); 311 pad_len = (-size) & (LOG_ALIGN - 1);
317 size += pad_len; 312 size += pad_len;
318 313
@@ -324,7 +319,7 @@ static void log_store(int facility, int level,
324 else 319 else
325 free = log_first_idx - log_next_idx; 320 free = log_first_idx - log_next_idx;
326 321
327 if (free > size + sizeof(struct log)) 322 if (free > size + sizeof(struct printk_log))
328 break; 323 break;
329 324
330 /* drop old messages until we have enough contiuous space */ 325 /* drop old messages until we have enough contiuous space */
@@ -332,18 +327,18 @@ static void log_store(int facility, int level,
332 log_first_seq++; 327 log_first_seq++;
333 } 328 }
334 329
335 if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { 330 if (log_next_idx + size + sizeof(struct printk_log) >= log_buf_len) {
336 /* 331 /*
337 * This message + an additional empty header does not fit 332 * This message + an additional empty header does not fit
338 * at the end of the buffer. Add an empty header with len == 0 333 * at the end of the buffer. Add an empty header with len == 0
339 * to signify a wrap around. 334 * to signify a wrap around.
340 */ 335 */
341 memset(log_buf + log_next_idx, 0, sizeof(struct log)); 336 memset(log_buf + log_next_idx, 0, sizeof(struct printk_log));
342 log_next_idx = 0; 337 log_next_idx = 0;
343 } 338 }
344 339
345 /* fill message */ 340 /* fill message */
346 msg = (struct log *)(log_buf + log_next_idx); 341 msg = (struct printk_log *)(log_buf + log_next_idx);
347 memcpy(log_text(msg), text, text_len); 342 memcpy(log_text(msg), text, text_len);
348 msg->text_len = text_len; 343 msg->text_len = text_len;
349 memcpy(log_dict(msg), dict, dict_len); 344 memcpy(log_dict(msg), dict, dict_len);
@@ -356,7 +351,7 @@ static void log_store(int facility, int level,
356 else 351 else
357 msg->ts_nsec = local_clock(); 352 msg->ts_nsec = local_clock();
358 memset(log_dict(msg) + dict_len, 0, pad_len); 353 memset(log_dict(msg) + dict_len, 0, pad_len);
359 msg->len = sizeof(struct log) + text_len + dict_len + pad_len; 354 msg->len = sizeof(struct printk_log) + text_len + dict_len + pad_len;
360 355
361 /* insert message */ 356 /* insert message */
362 log_next_idx += msg->len; 357 log_next_idx += msg->len;
@@ -479,7 +474,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
479 size_t count, loff_t *ppos) 474 size_t count, loff_t *ppos)
480{ 475{
481 struct devkmsg_user *user = file->private_data; 476 struct devkmsg_user *user = file->private_data;
482 struct log *msg; 477 struct printk_log *msg;
483 u64 ts_usec; 478 u64 ts_usec;
484 size_t i; 479 size_t i;
485 char cont = '-'; 480 char cont = '-';
@@ -724,14 +719,14 @@ void log_buf_kexec_setup(void)
724 VMCOREINFO_SYMBOL(log_first_idx); 719 VMCOREINFO_SYMBOL(log_first_idx);
725 VMCOREINFO_SYMBOL(log_next_idx); 720 VMCOREINFO_SYMBOL(log_next_idx);
726 /* 721 /*
727 * Export struct log size and field offsets. User space tools can 722 * Export struct printk_log size and field offsets. User space tools can
728 * parse it and detect any changes to structure down the line. 723 * parse it and detect any changes to structure down the line.
729 */ 724 */
730 VMCOREINFO_STRUCT_SIZE(log); 725 VMCOREINFO_STRUCT_SIZE(printk_log);
731 VMCOREINFO_OFFSET(log, ts_nsec); 726 VMCOREINFO_OFFSET(printk_log, ts_nsec);
732 VMCOREINFO_OFFSET(log, len); 727 VMCOREINFO_OFFSET(printk_log, len);
733 VMCOREINFO_OFFSET(log, text_len); 728 VMCOREINFO_OFFSET(printk_log, text_len);
734 VMCOREINFO_OFFSET(log, dict_len); 729 VMCOREINFO_OFFSET(printk_log, dict_len);
735} 730}
736#endif 731#endif
737 732
@@ -884,7 +879,7 @@ static size_t print_time(u64 ts, char *buf)
884 (unsigned long)ts, rem_nsec / 1000); 879 (unsigned long)ts, rem_nsec / 1000);
885} 880}
886 881
887static size_t print_prefix(const struct log *msg, bool syslog, char *buf) 882static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf)
888{ 883{
889 size_t len = 0; 884 size_t len = 0;
890 unsigned int prefix = (msg->facility << 3) | msg->level; 885 unsigned int prefix = (msg->facility << 3) | msg->level;
@@ -907,7 +902,7 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
907 return len; 902 return len;
908} 903}
909 904
910static size_t msg_print_text(const struct log *msg, enum log_flags prev, 905static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
911 bool syslog, char *buf, size_t size) 906 bool syslog, char *buf, size_t size)
912{ 907{
913 const char *text = log_text(msg); 908 const char *text = log_text(msg);
@@ -969,7 +964,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev,
969static int syslog_print(char __user *buf, int size) 964static int syslog_print(char __user *buf, int size)
970{ 965{
971 char *text; 966 char *text;
972 struct log *msg; 967 struct printk_log *msg;
973 int len = 0; 968 int len = 0;
974 969
975 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); 970 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
@@ -1060,7 +1055,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1060 idx = clear_idx; 1055 idx = clear_idx;
1061 prev = 0; 1056 prev = 0;
1062 while (seq < log_next_seq) { 1057 while (seq < log_next_seq) {
1063 struct log *msg = log_from_idx(idx); 1058 struct printk_log *msg = log_from_idx(idx);
1064 1059
1065 len += msg_print_text(msg, prev, true, NULL, 0); 1060 len += msg_print_text(msg, prev, true, NULL, 0);
1066 prev = msg->flags; 1061 prev = msg->flags;
@@ -1073,7 +1068,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1073 idx = clear_idx; 1068 idx = clear_idx;
1074 prev = 0; 1069 prev = 0;
1075 while (len > size && seq < log_next_seq) { 1070 while (len > size && seq < log_next_seq) {
1076 struct log *msg = log_from_idx(idx); 1071 struct printk_log *msg = log_from_idx(idx);
1077 1072
1078 len -= msg_print_text(msg, prev, true, NULL, 0); 1073 len -= msg_print_text(msg, prev, true, NULL, 0);
1079 prev = msg->flags; 1074 prev = msg->flags;
@@ -1087,7 +1082,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1087 len = 0; 1082 len = 0;
1088 prev = 0; 1083 prev = 0;
1089 while (len >= 0 && seq < next_seq) { 1084 while (len >= 0 && seq < next_seq) {
1090 struct log *msg = log_from_idx(idx); 1085 struct printk_log *msg = log_from_idx(idx);
1091 int textlen; 1086 int textlen;
1092 1087
1093 textlen = msg_print_text(msg, prev, true, text, 1088 textlen = msg_print_text(msg, prev, true, text,
@@ -1233,7 +1228,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1233 1228
1234 error = 0; 1229 error = 0;
1235 while (seq < log_next_seq) { 1230 while (seq < log_next_seq) {
1236 struct log *msg = log_from_idx(idx); 1231 struct printk_log *msg = log_from_idx(idx);
1237 1232
1238 error += msg_print_text(msg, prev, true, NULL, 0); 1233 error += msg_print_text(msg, prev, true, NULL, 0);
1239 idx = log_next(idx); 1234 idx = log_next(idx);
@@ -1369,9 +1364,9 @@ static int console_trylock_for_printk(unsigned int cpu)
1369 } 1364 }
1370 } 1365 }
1371 logbuf_cpu = UINT_MAX; 1366 logbuf_cpu = UINT_MAX;
1367 raw_spin_unlock(&logbuf_lock);
1372 if (wake) 1368 if (wake)
1373 up(&console_sem); 1369 up(&console_sem);
1374 raw_spin_unlock(&logbuf_lock);
1375 return retval; 1370 return retval;
1376} 1371}
1377 1372
@@ -1719,10 +1714,10 @@ static struct cont {
1719 u8 level; 1714 u8 level;
1720 bool flushed:1; 1715 bool flushed:1;
1721} cont; 1716} cont;
1722static struct log *log_from_idx(u32 idx) { return NULL; } 1717static struct printk_log *log_from_idx(u32 idx) { return NULL; }
1723static u32 log_next(u32 idx) { return 0; } 1718static u32 log_next(u32 idx) { return 0; }
1724static void call_console_drivers(int level, const char *text, size_t len) {} 1719static void call_console_drivers(int level, const char *text, size_t len) {}
1725static size_t msg_print_text(const struct log *msg, enum log_flags prev, 1720static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
1726 bool syslog, char *buf, size_t size) { return 0; } 1721 bool syslog, char *buf, size_t size) { return 0; }
1727static size_t cont_print_text(char *text, size_t size) { return 0; } 1722static size_t cont_print_text(char *text, size_t size) { return 0; }
1728 1723
@@ -1761,23 +1756,23 @@ static int __add_preferred_console(char *name, int idx, char *options,
1761 * See if this tty is not yet registered, and 1756 * See if this tty is not yet registered, and
1762 * if we have a slot free. 1757 * if we have a slot free.
1763 */ 1758 */
1764 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) 1759 for (i = 0, c = console_cmdline;
1765 if (strcmp(console_cmdline[i].name, name) == 0 && 1760 i < MAX_CMDLINECONSOLES && c->name[0];
1766 console_cmdline[i].index == idx) { 1761 i++, c++) {
1767 if (!brl_options) 1762 if (strcmp(c->name, name) == 0 && c->index == idx) {
1768 selected_console = i; 1763 if (!brl_options)
1769 return 0; 1764 selected_console = i;
1765 return 0;
1770 } 1766 }
1767 }
1771 if (i == MAX_CMDLINECONSOLES) 1768 if (i == MAX_CMDLINECONSOLES)
1772 return -E2BIG; 1769 return -E2BIG;
1773 if (!brl_options) 1770 if (!brl_options)
1774 selected_console = i; 1771 selected_console = i;
1775 c = &console_cmdline[i];
1776 strlcpy(c->name, name, sizeof(c->name)); 1772 strlcpy(c->name, name, sizeof(c->name));
1777 c->options = options; 1773 c->options = options;
1778#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 1774 braille_set_options(c, brl_options);
1779 c->brl_options = brl_options; 1775
1780#endif
1781 c->index = idx; 1776 c->index = idx;
1782 return 0; 1777 return 0;
1783} 1778}
@@ -1790,20 +1785,8 @@ static int __init console_setup(char *str)
1790 char *s, *options, *brl_options = NULL; 1785 char *s, *options, *brl_options = NULL;
1791 int idx; 1786 int idx;
1792 1787
1793#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 1788 if (_braille_console_setup(&str, &brl_options))
1794 if (!memcmp(str, "brl,", 4)) { 1789 return 1;
1795 brl_options = "";
1796 str += 4;
1797 } else if (!memcmp(str, "brl=", 4)) {
1798 brl_options = str + 4;
1799 str = strchr(brl_options, ',');
1800 if (!str) {
1801 printk(KERN_ERR "need port name after brl=\n");
1802 return 1;
1803 }
1804 *(str++) = 0;
1805 }
1806#endif
1807 1790
1808 /* 1791 /*
1809 * Decode str into name, index, options. 1792 * Decode str into name, index, options.
@@ -1858,15 +1841,15 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1858 struct console_cmdline *c; 1841 struct console_cmdline *c;
1859 int i; 1842 int i;
1860 1843
1861 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) 1844 for (i = 0, c = console_cmdline;
1862 if (strcmp(console_cmdline[i].name, name) == 0 && 1845 i < MAX_CMDLINECONSOLES && c->name[0];
1863 console_cmdline[i].index == idx) { 1846 i++, c++)
1864 c = &console_cmdline[i]; 1847 if (strcmp(c->name, name) == 0 && c->index == idx) {
1865 strlcpy(c->name, name_new, sizeof(c->name)); 1848 strlcpy(c->name, name_new, sizeof(c->name));
1866 c->name[sizeof(c->name) - 1] = 0; 1849 c->name[sizeof(c->name) - 1] = 0;
1867 c->options = options; 1850 c->options = options;
1868 c->index = idx_new; 1851 c->index = idx_new;
1869 return i; 1852 return i;
1870 } 1853 }
1871 /* not found */ 1854 /* not found */
1872 return -1; 1855 return -1;
@@ -1921,7 +1904,7 @@ void resume_console(void)
1921 * called when a new CPU comes online (or fails to come up), and ensures 1904 * called when a new CPU comes online (or fails to come up), and ensures
1922 * that any such output gets printed. 1905 * that any such output gets printed.
1923 */ 1906 */
1924static int __cpuinit console_cpu_notify(struct notifier_block *self, 1907static int console_cpu_notify(struct notifier_block *self,
1925 unsigned long action, void *hcpu) 1908 unsigned long action, void *hcpu)
1926{ 1909{
1927 switch (action) { 1910 switch (action) {
@@ -2046,7 +2029,7 @@ void console_unlock(void)
2046 console_cont_flush(text, sizeof(text)); 2029 console_cont_flush(text, sizeof(text));
2047again: 2030again:
2048 for (;;) { 2031 for (;;) {
2049 struct log *msg; 2032 struct printk_log *msg;
2050 size_t len; 2033 size_t len;
2051 int level; 2034 int level;
2052 2035
@@ -2241,6 +2224,7 @@ void register_console(struct console *newcon)
2241 int i; 2224 int i;
2242 unsigned long flags; 2225 unsigned long flags;
2243 struct console *bcon = NULL; 2226 struct console *bcon = NULL;
2227 struct console_cmdline *c;
2244 2228
2245 /* 2229 /*
2246 * before we register a new CON_BOOT console, make sure we don't 2230 * before we register a new CON_BOOT console, make sure we don't
@@ -2288,30 +2272,25 @@ void register_console(struct console *newcon)
2288 * See if this console matches one we selected on 2272 * See if this console matches one we selected on
2289 * the command line. 2273 * the command line.
2290 */ 2274 */
2291 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; 2275 for (i = 0, c = console_cmdline;
2292 i++) { 2276 i < MAX_CMDLINECONSOLES && c->name[0];
2293 if (strcmp(console_cmdline[i].name, newcon->name) != 0) 2277 i++, c++) {
2278 if (strcmp(c->name, newcon->name) != 0)
2294 continue; 2279 continue;
2295 if (newcon->index >= 0 && 2280 if (newcon->index >= 0 &&
2296 newcon->index != console_cmdline[i].index) 2281 newcon->index != c->index)
2297 continue; 2282 continue;
2298 if (newcon->index < 0) 2283 if (newcon->index < 0)
2299 newcon->index = console_cmdline[i].index; 2284 newcon->index = c->index;
2300#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 2285
2301 if (console_cmdline[i].brl_options) { 2286 if (_braille_register_console(newcon, c))
2302 newcon->flags |= CON_BRL;
2303 braille_register_console(newcon,
2304 console_cmdline[i].index,
2305 console_cmdline[i].options,
2306 console_cmdline[i].brl_options);
2307 return; 2287 return;
2308 } 2288
2309#endif
2310 if (newcon->setup && 2289 if (newcon->setup &&
2311 newcon->setup(newcon, console_cmdline[i].options) != 0) 2290 newcon->setup(newcon, console_cmdline[i].options) != 0)
2312 break; 2291 break;
2313 newcon->flags |= CON_ENABLED; 2292 newcon->flags |= CON_ENABLED;
2314 newcon->index = console_cmdline[i].index; 2293 newcon->index = c->index;
2315 if (i == selected_console) { 2294 if (i == selected_console) {
2316 newcon->flags |= CON_CONSDEV; 2295 newcon->flags |= CON_CONSDEV;
2317 preferred_console = selected_console; 2296 preferred_console = selected_console;
@@ -2394,13 +2373,13 @@ EXPORT_SYMBOL(register_console);
2394int unregister_console(struct console *console) 2373int unregister_console(struct console *console)
2395{ 2374{
2396 struct console *a, *b; 2375 struct console *a, *b;
2397 int res = 1; 2376 int res;
2398 2377
2399#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 2378 res = _braille_unregister_console(console);
2400 if (console->flags & CON_BRL) 2379 if (res)
2401 return braille_unregister_console(console); 2380 return res;
2402#endif
2403 2381
2382 res = 1;
2404 console_lock(); 2383 console_lock();
2405 if (console_drivers == console) { 2384 if (console_drivers == console) {
2406 console_drivers=console->next; 2385 console_drivers=console->next;
@@ -2666,7 +2645,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
2666bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, 2645bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
2667 char *line, size_t size, size_t *len) 2646 char *line, size_t size, size_t *len)
2668{ 2647{
2669 struct log *msg; 2648 struct printk_log *msg;
2670 size_t l = 0; 2649 size_t l = 0;
2671 bool ret = false; 2650 bool ret = false;
2672 2651
@@ -2778,7 +2757,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2778 idx = dumper->cur_idx; 2757 idx = dumper->cur_idx;
2779 prev = 0; 2758 prev = 0;
2780 while (seq < dumper->next_seq) { 2759 while (seq < dumper->next_seq) {
2781 struct log *msg = log_from_idx(idx); 2760 struct printk_log *msg = log_from_idx(idx);
2782 2761
2783 l += msg_print_text(msg, prev, true, NULL, 0); 2762 l += msg_print_text(msg, prev, true, NULL, 0);
2784 idx = log_next(idx); 2763 idx = log_next(idx);
@@ -2791,7 +2770,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2791 idx = dumper->cur_idx; 2770 idx = dumper->cur_idx;
2792 prev = 0; 2771 prev = 0;
2793 while (l > size && seq < dumper->next_seq) { 2772 while (l > size && seq < dumper->next_seq) {
2794 struct log *msg = log_from_idx(idx); 2773 struct printk_log *msg = log_from_idx(idx);
2795 2774
2796 l -= msg_print_text(msg, prev, true, NULL, 0); 2775 l -= msg_print_text(msg, prev, true, NULL, 0);
2797 idx = log_next(idx); 2776 idx = log_next(idx);
@@ -2806,7 +2785,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2806 l = 0; 2785 l = 0;
2807 prev = 0; 2786 prev = 0;
2808 while (seq < dumper->next_seq) { 2787 while (seq < dumper->next_seq) {
2809 struct log *msg = log_from_idx(idx); 2788 struct printk_log *msg = log_from_idx(idx);
2810 2789
2811 l += msg_print_text(msg, prev, syslog, buf + l, size - l); 2790 l += msg_print_text(msg, prev, syslog, buf + l, size - l);
2812 idx = log_next(idx); 2791 idx = log_next(idx);
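For context on the printk.c hunks above: the CONFIG_A11Y_BRAILLE_CONSOLE handling moves out of printk.c and behind a few helpers (per the diffstat, into the new kernel/printk/braille.[ch] files). A rough sketch of the wrapper pattern those call sites assume is shown below; the function names come from the diff itself, but the bodies, comments and exact signatures are inferred from the call sites rather than quoted from the new header.

/* Sketch only: shape of the braille wrappers assumed by the call sites above. */
#ifdef CONFIG_A11Y_BRAILLE_CONSOLE

static inline void
braille_set_options(struct console_cmdline *c, char *brl_options)
{
        c->brl_options = brl_options;
}

/* Strips a leading "brl," / "brl=port," prefix from *str; returns
 * non-zero only on a parse error, so console_setup() can bail out. */
int _braille_console_setup(char **str, char **brl_options);

/* Returns non-zero if it registered @console as a braille console,
 * in which case register_console() is done with it. */
int _braille_register_console(struct console *console,
                              struct console_cmdline *c);

/* Returns non-zero if @console was a braille console and has now
 * been unregistered. */
int _braille_unregister_console(struct console *console);

#else /* !CONFIG_A11Y_BRAILLE_CONSOLE */

static inline void
braille_set_options(struct console_cmdline *c, char *brl_options) { }
static inline int
_braille_console_setup(char **str, char **brl_options) { return 0; }
static inline int
_braille_register_console(struct console *console,
                          struct console_cmdline *c) { return 0; }
static inline int
_braille_unregister_console(struct console *console) { return 0; }

#endif /* CONFIG_A11Y_BRAILLE_CONSOLE */

The point of the split is that the #ifdef lives in one place, so console_setup(), register_console() and unregister_console() above each collapse to a single unconditional call.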
diff --git a/kernel/profile.c b/kernel/profile.c
index 0bf400737660..6631e1ef55ab 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -331,7 +331,7 @@ out:
331 put_cpu(); 331 put_cpu();
332} 332}
333 333
334static int __cpuinit profile_cpu_callback(struct notifier_block *info, 334static int profile_cpu_callback(struct notifier_block *info,
335 unsigned long action, void *__cpu) 335 unsigned long action, void *__cpu)
336{ 336{
337 int node, cpu = (unsigned long)__cpu; 337 int node, cpu = (unsigned long)__cpu;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 335a7ae697f5..a146ee327f6a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -844,6 +844,47 @@ int ptrace_request(struct task_struct *child, long request,
844 ret = ptrace_setsiginfo(child, &siginfo); 844 ret = ptrace_setsiginfo(child, &siginfo);
845 break; 845 break;
846 846
847 case PTRACE_GETSIGMASK:
848 if (addr != sizeof(sigset_t)) {
849 ret = -EINVAL;
850 break;
851 }
852
853 if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
854 ret = -EFAULT;
855 else
856 ret = 0;
857
858 break;
859
860 case PTRACE_SETSIGMASK: {
861 sigset_t new_set;
862
863 if (addr != sizeof(sigset_t)) {
864 ret = -EINVAL;
865 break;
866 }
867
868 if (copy_from_user(&new_set, datavp, sizeof(sigset_t))) {
869 ret = -EFAULT;
870 break;
871 }
872
873 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
874
875 /*
876 * Every thread does recalc_sigpending() after resume, so
877 * retarget_shared_pending() and recalc_sigpending() are not
878 * called here.
879 */
880 spin_lock_irq(&child->sighand->siglock);
881 child->blocked = new_set;
882 spin_unlock_irq(&child->sighand->siglock);
883
884 ret = 0;
885 break;
886 }
887
847 case PTRACE_INTERRUPT: 888 case PTRACE_INTERRUPT:
848 /* 889 /*
849 * Stop tracee without any side-effect on signal or job 890 * Stop tracee without any side-effect on signal or job
@@ -948,8 +989,7 @@ int ptrace_request(struct task_struct *child, long request,
948 989
949#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 990#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
950 case PTRACE_GETREGSET: 991 case PTRACE_GETREGSET:
951 case PTRACE_SETREGSET: 992 case PTRACE_SETREGSET: {
952 {
953 struct iovec kiov; 993 struct iovec kiov;
954 struct iovec __user *uiov = datavp; 994 struct iovec __user *uiov = datavp;
955 995
@@ -1181,19 +1221,3 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
1181 return ret; 1221 return ret;
1182} 1222}
1183#endif /* CONFIG_COMPAT */ 1223#endif /* CONFIG_COMPAT */
1184
1185#ifdef CONFIG_HAVE_HW_BREAKPOINT
1186int ptrace_get_breakpoints(struct task_struct *tsk)
1187{
1188 if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
1189 return 0;
1190
1191 return -1;
1192}
1193
1194void ptrace_put_breakpoints(struct task_struct *tsk)
1195{
1196 if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
1197 flush_ptrace_hw_breakpoint(tsk);
1198}
1199#endif /* CONFIG_HAVE_HW_BREAKPOINT */
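The PTRACE_GETSIGMASK/PTRACE_SETSIGMASK hunk above gives a tracer a way to read and replace a tracee's blocked-signal mask. Below is a minimal tracer-side sketch, assuming a kernel that carries this patch and a tracee that is already attached and stopped; the request values mirror the uapi additions of this series, and the 8-byte size reflects the kernel's sigset_t (64 signals) on the common ABIs, not glibc's much larger sigset_t.

#include <stdint.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>

#ifndef PTRACE_GETSIGMASK               /* added to uapi by this series */
#define PTRACE_GETSIGMASK 0x420a
#define PTRACE_SETSIGMASK 0x420b
#endif

/* Caller must already have attached to and stopped the tracee. */
static int dump_and_tweak_sigmask(pid_t pid)
{
        uint64_t mask;                  /* kernel sigset_t: 64 signals -> 8 bytes */

        if (ptrace(PTRACE_GETSIGMASK, pid, (void *)sizeof(mask), &mask) < 0) {
                perror("PTRACE_GETSIGMASK");
                return -1;
        }
        printf("pid %d blocked mask: %#llx\n", (int)pid, (unsigned long long)mask);

        mask &= ~(1ULL << (10 - 1));    /* unblock signal 10 (SIGUSR1 on x86) */
        if (ptrace(PTRACE_SETSIGMASK, pid, (void *)sizeof(mask), &mask) < 0) {
                perror("PTRACE_SETSIGMASK");
                return -1;
        }
        return 0;
}

Any other addr size is rejected with -EINVAL, and SIGKILL/SIGSTOP can never be blocked, as the sigdelsetmask() call in the hunk shows.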
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 48ab70384a4c..cce6ba8bbace 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -104,31 +104,7 @@ void __rcu_read_unlock(void)
104} 104}
105EXPORT_SYMBOL_GPL(__rcu_read_unlock); 105EXPORT_SYMBOL_GPL(__rcu_read_unlock);
106 106
107/* 107#endif /* #ifdef CONFIG_PREEMPT_RCU */
108 * Check for a task exiting while in a preemptible-RCU read-side
109 * critical section, clean up if so. No need to issue warnings,
110 * as debug_check_no_locks_held() already does this if lockdep
111 * is enabled.
112 */
113void exit_rcu(void)
114{
115 struct task_struct *t = current;
116
117 if (likely(list_empty(&current->rcu_node_entry)))
118 return;
119 t->rcu_read_lock_nesting = 1;
120 barrier();
121 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
122 __rcu_read_unlock();
123}
124
125#else /* #ifdef CONFIG_PREEMPT_RCU */
126
127void exit_rcu(void)
128{
129}
130
131#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
132 108
133#ifdef CONFIG_DEBUG_LOCK_ALLOC 109#ifdef CONFIG_DEBUG_LOCK_ALLOC
134static struct lock_class_key rcu_lock_key; 110static struct lock_class_key rcu_lock_key;
@@ -145,9 +121,6 @@ static struct lock_class_key rcu_sched_lock_key;
145struct lockdep_map rcu_sched_lock_map = 121struct lockdep_map rcu_sched_lock_map =
146 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); 122 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
147EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 123EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
148#endif
149
150#ifdef CONFIG_DEBUG_LOCK_ALLOC
151 124
152int debug_lockdep_rcu_enabled(void) 125int debug_lockdep_rcu_enabled(void)
153{ 126{
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index a0714a51b6d7..aa344111de3e 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,7 +44,6 @@
44 44
45/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 46struct rcu_ctrlblk;
47static void invoke_rcu_callbacks(void);
48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 47static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
49static void rcu_process_callbacks(struct softirq_action *unused); 48static void rcu_process_callbacks(struct softirq_action *unused);
50static void __call_rcu(struct rcu_head *head, 49static void __call_rcu(struct rcu_head *head,
@@ -205,7 +204,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
205 */ 204 */
206static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 205static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
207{ 206{
208 reset_cpu_stall_ticks(rcp); 207 RCU_TRACE(reset_cpu_stall_ticks(rcp));
209 if (rcp->rcucblist != NULL && 208 if (rcp->rcucblist != NULL &&
210 rcp->donetail != rcp->curtail) { 209 rcp->donetail != rcp->curtail) {
211 rcp->donetail = rcp->curtail; 210 rcp->donetail = rcp->curtail;
@@ -227,7 +226,7 @@ void rcu_sched_qs(int cpu)
227 local_irq_save(flags); 226 local_irq_save(flags);
228 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 227 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
229 rcu_qsctr_help(&rcu_bh_ctrlblk)) 228 rcu_qsctr_help(&rcu_bh_ctrlblk))
230 invoke_rcu_callbacks(); 229 raise_softirq(RCU_SOFTIRQ);
231 local_irq_restore(flags); 230 local_irq_restore(flags);
232} 231}
233 232
@@ -240,7 +239,7 @@ void rcu_bh_qs(int cpu)
240 239
241 local_irq_save(flags); 240 local_irq_save(flags);
242 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 241 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
243 invoke_rcu_callbacks(); 242 raise_softirq(RCU_SOFTIRQ);
244 local_irq_restore(flags); 243 local_irq_restore(flags);
245} 244}
246 245
@@ -252,12 +251,11 @@ void rcu_bh_qs(int cpu)
252 */ 251 */
253void rcu_check_callbacks(int cpu, int user) 252void rcu_check_callbacks(int cpu, int user)
254{ 253{
255 check_cpu_stalls(); 254 RCU_TRACE(check_cpu_stalls());
256 if (user || rcu_is_cpu_rrupt_from_idle()) 255 if (user || rcu_is_cpu_rrupt_from_idle())
257 rcu_sched_qs(cpu); 256 rcu_sched_qs(cpu);
258 else if (!in_softirq()) 257 else if (!in_softirq())
259 rcu_bh_qs(cpu); 258 rcu_bh_qs(cpu);
260 rcu_preempt_check_callbacks();
261} 259}
262 260
263/* 261/*
@@ -278,7 +276,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
278 ACCESS_ONCE(rcp->rcucblist), 276 ACCESS_ONCE(rcp->rcucblist),
279 need_resched(), 277 need_resched(),
280 is_idle_task(current), 278 is_idle_task(current),
281 rcu_is_callbacks_kthread())); 279 false));
282 return; 280 return;
283 } 281 }
284 282
@@ -290,7 +288,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
290 *rcp->donetail = NULL; 288 *rcp->donetail = NULL;
291 if (rcp->curtail == rcp->donetail) 289 if (rcp->curtail == rcp->donetail)
292 rcp->curtail = &rcp->rcucblist; 290 rcp->curtail = &rcp->rcucblist;
293 rcu_preempt_remove_callbacks(rcp);
294 rcp->donetail = &rcp->rcucblist; 291 rcp->donetail = &rcp->rcucblist;
295 local_irq_restore(flags); 292 local_irq_restore(flags);
296 293
@@ -309,14 +306,13 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
309 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 306 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
310 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), 307 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
311 is_idle_task(current), 308 is_idle_task(current),
312 rcu_is_callbacks_kthread())); 309 false));
313} 310}
314 311
315static void rcu_process_callbacks(struct softirq_action *unused) 312static void rcu_process_callbacks(struct softirq_action *unused)
316{ 313{
317 __rcu_process_callbacks(&rcu_sched_ctrlblk); 314 __rcu_process_callbacks(&rcu_sched_ctrlblk);
318 __rcu_process_callbacks(&rcu_bh_ctrlblk); 315 __rcu_process_callbacks(&rcu_bh_ctrlblk);
319 rcu_preempt_process_callbacks();
320} 316}
321 317
322/* 318/*
@@ -382,3 +378,8 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
382 __call_rcu(head, func, &rcu_bh_ctrlblk); 378 __call_rcu(head, func, &rcu_bh_ctrlblk);
383} 379}
384EXPORT_SYMBOL_GPL(call_rcu_bh); 380EXPORT_SYMBOL_GPL(call_rcu_bh);
381
382void rcu_init(void)
383{
384 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
385}
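Several rcutiny.c hunks above wrap the stall-warning calls in RCU_TRACE(), so that the bookkeeping vanishes entirely when tracing is not configured. For reference, the helper is roughly the following (its actual home is kernel/rcu.h in this tree; shown here only to make the hunks self-explanatory):

/* Sketch of the RCU_TRACE() helper assumed by the hunks above. */
#ifdef CONFIG_RCU_TRACE
#define RCU_TRACE(stmt) stmt            /* emit the statement as written */
#else
#define RCU_TRACE(stmt)                 /* compile the statement away */
#endif

/* e.g. RCU_TRACE(reset_cpu_stall_ticks(rcp)); costs nothing without tracing. */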
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 8a233002faeb..0cd385acccfa 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -53,958 +53,10 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53}; 53};
54 54
55#ifdef CONFIG_DEBUG_LOCK_ALLOC 55#ifdef CONFIG_DEBUG_LOCK_ALLOC
56#include <linux/kernel_stat.h>
57
56int rcu_scheduler_active __read_mostly; 58int rcu_scheduler_active __read_mostly;
57EXPORT_SYMBOL_GPL(rcu_scheduler_active); 59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
58#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
59
60#ifdef CONFIG_RCU_TRACE
61
62static void check_cpu_stall(struct rcu_ctrlblk *rcp)
63{
64 unsigned long j;
65 unsigned long js;
66
67 if (rcu_cpu_stall_suppress)
68 return;
69 rcp->ticks_this_gp++;
70 j = jiffies;
71 js = rcp->jiffies_stall;
72 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
73 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
74 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
75 jiffies - rcp->gp_start, rcp->qlen);
76 dump_stack();
77 }
78 if (*rcp->curtail && ULONG_CMP_GE(j, js))
79 rcp->jiffies_stall = jiffies +
80 3 * rcu_jiffies_till_stall_check() + 3;
81 else if (ULONG_CMP_GE(j, js))
82 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
83}
84
85static void check_cpu_stall_preempt(void);
86
87#endif /* #ifdef CONFIG_RCU_TRACE */
88
89static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
90{
91#ifdef CONFIG_RCU_TRACE
92 rcp->ticks_this_gp = 0;
93 rcp->gp_start = jiffies;
94 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
95#endif /* #ifdef CONFIG_RCU_TRACE */
96}
97
98static void check_cpu_stalls(void)
99{
100 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
101 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
102 RCU_TRACE(check_cpu_stall_preempt());
103}
104
105#ifdef CONFIG_TINY_PREEMPT_RCU
106
107#include <linux/delay.h>
108
109/* Global control variables for preemptible RCU. */
110struct rcu_preempt_ctrlblk {
111 struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
112 struct rcu_head **nexttail;
113 /* Tasks blocked in a preemptible RCU */
114 /* read-side critical section while an */
115 /* preemptible-RCU grace period is in */
116 /* progress must wait for a later grace */
117 /* period. This pointer points to the */
118 /* ->next pointer of the last task that */
119 /* must wait for a later grace period, or */
120 /* to &->rcb.rcucblist if there is no */
121 /* such task. */
122 struct list_head blkd_tasks;
123 /* Tasks blocked in RCU read-side critical */
124 /* section. Tasks are placed at the head */
125 /* of this list and age towards the tail. */
126 struct list_head *gp_tasks;
127 /* Pointer to the first task blocking the */
128 /* current grace period, or NULL if there */
129 /* is no such task. */
130 struct list_head *exp_tasks;
131 /* Pointer to first task blocking the */
132 /* current expedited grace period, or NULL */
133 /* if there is no such task. If there */
134 /* is no current expedited grace period, */
135 /* then there cannot be any such task. */
136#ifdef CONFIG_RCU_BOOST
137 struct list_head *boost_tasks;
138 /* Pointer to first task that needs to be */
139 /* priority-boosted, or NULL if no priority */
140 /* boosting is needed. If there is no */
141 /* current or expedited grace period, there */
142 /* can be no such task. */
143#endif /* #ifdef CONFIG_RCU_BOOST */
144 u8 gpnum; /* Current grace period. */
145 u8 gpcpu; /* Last grace period blocked by the CPU. */
146 u8 completed; /* Last grace period completed. */
147 /* If all three are equal, RCU is idle. */
148#ifdef CONFIG_RCU_BOOST
149 unsigned long boost_time; /* When to start boosting (jiffies) */
150#endif /* #ifdef CONFIG_RCU_BOOST */
151#ifdef CONFIG_RCU_TRACE
152 unsigned long n_grace_periods;
153#ifdef CONFIG_RCU_BOOST
154 unsigned long n_tasks_boosted;
155 /* Total number of tasks boosted. */
156 unsigned long n_exp_boosts;
157 /* Number of tasks boosted for expedited GP. */
158 unsigned long n_normal_boosts;
159 /* Number of tasks boosted for normal GP. */
160 unsigned long n_balk_blkd_tasks;
161 /* Refused to boost: no blocked tasks. */
162 unsigned long n_balk_exp_gp_tasks;
163 /* Refused to boost: nothing blocking GP. */
164 unsigned long n_balk_boost_tasks;
165 /* Refused to boost: already boosting. */
166 unsigned long n_balk_notyet;
167 /* Refused to boost: not yet time. */
168 unsigned long n_balk_nos;
169 /* Refused to boost: not sure why, though. */
170 /* This can happen due to race conditions. */
171#endif /* #ifdef CONFIG_RCU_BOOST */
172#endif /* #ifdef CONFIG_RCU_TRACE */
173};
174
175static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
176 .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
177 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
178 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
179 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
180 RCU_TRACE(.rcb.name = "rcu_preempt")
181};
182
183static int rcu_preempted_readers_exp(void);
184static void rcu_report_exp_done(void);
185
186/*
187 * Return true if the CPU has not yet responded to the current grace period.
188 */
189static int rcu_cpu_blocking_cur_gp(void)
190{
191 return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
192}
193
194/*
195 * Check for a running RCU reader. Because there is only one CPU,
196 * there can be but one running RCU reader at a time. ;-)
197 *
198 * Returns zero if there are no running readers. Returns a positive
199 * number if there is at least one reader within its RCU read-side
200 * critical section. Returns a negative number if an outermost reader
 201 * is in the midst of exiting from its RCU read-side critical section.
207 */
208static int rcu_preempt_running_reader(void)
209{
210 return current->rcu_read_lock_nesting;
211}
212
213/*
214 * Check for preempted RCU readers blocking any grace period.
215 * If the caller needs a reliable answer, it must disable hard irqs.
216 */
217static int rcu_preempt_blocked_readers_any(void)
218{
219 return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
220}
221
222/*
223 * Check for preempted RCU readers blocking the current grace period.
224 * If the caller needs a reliable answer, it must disable hard irqs.
225 */
226static int rcu_preempt_blocked_readers_cgp(void)
227{
228 return rcu_preempt_ctrlblk.gp_tasks != NULL;
229}
230
231/*
232 * Return true if another preemptible-RCU grace period is needed.
233 */
234static int rcu_preempt_needs_another_gp(void)
235{
236 return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
237}
238
239/*
240 * Return true if a preemptible-RCU grace period is in progress.
241 * The caller must disable hardirqs.
242 */
243static int rcu_preempt_gp_in_progress(void)
244{
245 return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
246}
247
248/*
249 * Advance a ->blkd_tasks-list pointer to the next entry, instead
250 * returning NULL if at the end of the list.
251 */
252static struct list_head *rcu_next_node_entry(struct task_struct *t)
253{
254 struct list_head *np;
255
256 np = t->rcu_node_entry.next;
257 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
258 np = NULL;
259 return np;
260}
261
262#ifdef CONFIG_RCU_TRACE
263
264#ifdef CONFIG_RCU_BOOST
265static void rcu_initiate_boost_trace(void);
266#endif /* #ifdef CONFIG_RCU_BOOST */
267
268/*
 269 * Dump additional statistics for TINY_PREEMPT_RCU.
270 */
271static void show_tiny_preempt_stats(struct seq_file *m)
272{
273 seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
274 rcu_preempt_ctrlblk.rcb.qlen,
275 rcu_preempt_ctrlblk.n_grace_periods,
276 rcu_preempt_ctrlblk.gpnum,
277 rcu_preempt_ctrlblk.gpcpu,
278 rcu_preempt_ctrlblk.completed,
279 "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
280 "N."[!rcu_preempt_ctrlblk.gp_tasks],
281 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
282#ifdef CONFIG_RCU_BOOST
283 seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
284 " ",
285 "B."[!rcu_preempt_ctrlblk.boost_tasks],
286 rcu_preempt_ctrlblk.n_tasks_boosted,
287 rcu_preempt_ctrlblk.n_exp_boosts,
288 rcu_preempt_ctrlblk.n_normal_boosts,
289 (int)(jiffies & 0xffff),
290 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
291 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
292 " balk",
293 rcu_preempt_ctrlblk.n_balk_blkd_tasks,
294 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
295 rcu_preempt_ctrlblk.n_balk_boost_tasks,
296 rcu_preempt_ctrlblk.n_balk_notyet,
297 rcu_preempt_ctrlblk.n_balk_nos);
298#endif /* #ifdef CONFIG_RCU_BOOST */
299}
300
301#endif /* #ifdef CONFIG_RCU_TRACE */
302
303#ifdef CONFIG_RCU_BOOST
304
305#include "rtmutex_common.h"
306
307#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
308
309/* Controls for rcu_kthread() kthread. */
310static struct task_struct *rcu_kthread_task;
311static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
312static unsigned long have_rcu_kthread_work;
313
314/*
315 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
316 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
317 */
318static int rcu_boost(void)
319{
320 unsigned long flags;
321 struct rt_mutex mtx;
322 struct task_struct *t;
323 struct list_head *tb;
324
325 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
326 rcu_preempt_ctrlblk.exp_tasks == NULL)
327 return 0; /* Nothing to boost. */
328
329 local_irq_save(flags);
330
331 /*
332 * Recheck with irqs disabled: all tasks in need of boosting
333 * might exit their RCU read-side critical sections on their own
334 * if we are preempted just before disabling irqs.
335 */
336 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
337 rcu_preempt_ctrlblk.exp_tasks == NULL) {
338 local_irq_restore(flags);
339 return 0;
340 }
341
342 /*
343 * Preferentially boost tasks blocking expedited grace periods.
344 * This cannot starve the normal grace periods because a second
345 * expedited grace period must boost all blocked tasks, including
346 * those blocking the pre-existing normal grace period.
347 */
348 if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
349 tb = rcu_preempt_ctrlblk.exp_tasks;
350 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
351 } else {
352 tb = rcu_preempt_ctrlblk.boost_tasks;
353 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
354 }
355 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
356
357 /*
358 * We boost task t by manufacturing an rt_mutex that appears to
359 * be held by task t. We leave a pointer to that rt_mutex where
360 * task t can find it, and task t will release the mutex when it
361 * exits its outermost RCU read-side critical section. Then
362 * simply acquiring this artificial rt_mutex will boost task
363 * t's priority. (Thanks to tglx for suggesting this approach!)
364 */
365 t = container_of(tb, struct task_struct, rcu_node_entry);
366 rt_mutex_init_proxy_locked(&mtx, t);
367 t->rcu_boost_mutex = &mtx;
368 local_irq_restore(flags);
369 rt_mutex_lock(&mtx);
370 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
371
372 return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
373 ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
374}
375
376/*
377 * Check to see if it is now time to start boosting RCU readers blocking
378 * the current grace period, and, if so, tell the rcu_kthread_task to
379 * start boosting them. If there is an expedited boost in progress,
380 * we wait for it to complete.
381 *
382 * If there are no blocked readers blocking the current grace period,
383 * return 0 to let the caller know, otherwise return 1. Note that this
384 * return value is independent of whether or not boosting was done.
385 */
386static int rcu_initiate_boost(void)
387{
388 if (!rcu_preempt_blocked_readers_cgp() &&
389 rcu_preempt_ctrlblk.exp_tasks == NULL) {
390 RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
391 return 0;
392 }
393 if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
394 (rcu_preempt_ctrlblk.gp_tasks != NULL &&
395 rcu_preempt_ctrlblk.boost_tasks == NULL &&
396 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
397 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
398 rcu_preempt_ctrlblk.boost_tasks =
399 rcu_preempt_ctrlblk.gp_tasks;
400 invoke_rcu_callbacks();
401 } else {
402 RCU_TRACE(rcu_initiate_boost_trace());
403 }
404 return 1;
405}
406
407#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
408
409/*
410 * Do priority-boost accounting for the start of a new grace period.
411 */
412static void rcu_preempt_boost_start_gp(void)
413{
414 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
415}
416
417#else /* #ifdef CONFIG_RCU_BOOST */
418
419/*
420 * If there is no RCU priority boosting, we don't initiate boosting,
421 * but we do indicate whether there are blocked readers blocking the
422 * current grace period.
423 */
424static int rcu_initiate_boost(void)
425{
426 return rcu_preempt_blocked_readers_cgp();
427}
428
429/*
430 * If there is no RCU priority boosting, nothing to do at grace-period start.
431 */
432static void rcu_preempt_boost_start_gp(void)
433{
434}
435
436#endif /* else #ifdef CONFIG_RCU_BOOST */
437
438/*
439 * Record a preemptible-RCU quiescent state for the specified CPU. Note
440 * that this just means that the task currently running on the CPU is
441 * in a quiescent state. There might be any number of tasks blocked
442 * while in an RCU read-side critical section.
443 *
444 * Unlike the other rcu_*_qs() functions, callers to this function
445 * must disable irqs in order to protect the assignment to
446 * ->rcu_read_unlock_special.
447 *
448 * Because this is a single-CPU implementation, the only way a grace
449 * period can end is if the CPU is in a quiescent state. The reason is
450 * that a blocked preemptible-RCU reader can exit its critical section
451 * only if the CPU is running it at the time. Therefore, when the
452 * last task blocking the current grace period exits its RCU read-side
453 * critical section, neither the CPU nor blocked tasks will be stopping
454 * the current grace period. (In contrast, SMP implementations
455 * might have CPUs running in RCU read-side critical sections that
456 * block later grace periods -- but this is not possible given only
457 * one CPU.)
458 */
459static void rcu_preempt_cpu_qs(void)
460{
461 /* Record both CPU and task as having responded to current GP. */
462 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
463 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
464
465 /* If there is no GP then there is nothing more to do. */
466 if (!rcu_preempt_gp_in_progress())
467 return;
468 /*
469 * Check up on boosting. If there are readers blocking the
470 * current grace period, leave.
471 */
472 if (rcu_initiate_boost())
473 return;
474
475 /* Advance callbacks. */
476 rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
477 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
478 rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
479
480 /* If there are no blocked readers, next GP is done instantly. */
481 if (!rcu_preempt_blocked_readers_any())
482 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
483
484 /* If there are done callbacks, cause them to be invoked. */
485 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
486 invoke_rcu_callbacks();
487}
488
489/*
490 * Start a new RCU grace period if warranted. Hard irqs must be disabled.
491 */
492static void rcu_preempt_start_gp(void)
493{
494 if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
495
496 /* Official start of GP. */
497 rcu_preempt_ctrlblk.gpnum++;
498 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
499 reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);
500
501 /* Any blocked RCU readers block new GP. */
502 if (rcu_preempt_blocked_readers_any())
503 rcu_preempt_ctrlblk.gp_tasks =
504 rcu_preempt_ctrlblk.blkd_tasks.next;
505
506 /* Set up for RCU priority boosting. */
507 rcu_preempt_boost_start_gp();
508
509 /* If there is no running reader, CPU is done with GP. */
510 if (!rcu_preempt_running_reader())
511 rcu_preempt_cpu_qs();
512 }
513}
514
515/*
516 * We have entered the scheduler, and the current task might soon be
517 * context-switched away from. If this task is in an RCU read-side
518 * critical section, we will no longer be able to rely on the CPU to
519 * record that fact, so we enqueue the task on the blkd_tasks list.
520 * If the task started after the current grace period began, as recorded
 521 * by ->gpcpu, we enqueue at the beginning of the list. Otherwise we
 522 * enqueue before the element referenced by ->gp_tasks (or at the tail if
 523 * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
524 * The task will dequeue itself when it exits the outermost enclosing
525 * RCU read-side critical section. Therefore, the current grace period
526 * cannot be permitted to complete until the ->gp_tasks pointer becomes
527 * NULL.
528 *
529 * Caller must disable preemption.
530 */
531void rcu_preempt_note_context_switch(void)
532{
533 struct task_struct *t = current;
534 unsigned long flags;
535
536 local_irq_save(flags); /* must exclude scheduler_tick(). */
537 if (rcu_preempt_running_reader() > 0 &&
538 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
539
540 /* Possibly blocking in an RCU read-side critical section. */
541 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
542
543 /*
544 * If this CPU has already checked in, then this task
545 * will hold up the next grace period rather than the
546 * current grace period. Queue the task accordingly.
547 * If the task is queued for the current grace period
548 * (i.e., this CPU has not yet passed through a quiescent
549 * state for the current grace period), then as long
550 * as that task remains queued, the current grace period
551 * cannot end.
552 */
553 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
554 if (rcu_cpu_blocking_cur_gp())
555 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
556 } else if (rcu_preempt_running_reader() < 0 &&
557 t->rcu_read_unlock_special) {
558 /*
559 * Complete exit from RCU read-side critical section on
560 * behalf of preempted instance of __rcu_read_unlock().
561 */
562 rcu_read_unlock_special(t);
563 }
564
565 /*
566 * Either we were not in an RCU read-side critical section to
567 * begin with, or we have now recorded that critical section
568 * globally. Either way, we can now note a quiescent state
569 * for this CPU. Again, if we were in an RCU read-side critical
570 * section, and if that critical section was blocking the current
571 * grace period, then the fact that the task has been enqueued
 572 * means that the current grace period continues to be blocked.
573 */
574 rcu_preempt_cpu_qs();
575 local_irq_restore(flags);
576}
577
578/*
579 * Handle special cases during rcu_read_unlock(), such as needing to
580 * notify RCU core processing or task having blocked during the RCU
581 * read-side critical section.
582 */
583void rcu_read_unlock_special(struct task_struct *t)
584{
585 int empty;
586 int empty_exp;
587 unsigned long flags;
588 struct list_head *np;
589#ifdef CONFIG_RCU_BOOST
590 struct rt_mutex *rbmp = NULL;
591#endif /* #ifdef CONFIG_RCU_BOOST */
592 int special;
593
594 /*
595 * NMI handlers cannot block and cannot safely manipulate state.
596 * They therefore cannot possibly be special, so just leave.
597 */
598 if (in_nmi())
599 return;
600
601 local_irq_save(flags);
602
603 /*
604 * If RCU core is waiting for this CPU to exit critical section,
605 * let it know that we have done so.
606 */
607 special = t->rcu_read_unlock_special;
608 if (special & RCU_READ_UNLOCK_NEED_QS)
609 rcu_preempt_cpu_qs();
610
611 /* Hardware IRQ handlers cannot block. */
612 if (in_irq() || in_serving_softirq()) {
613 local_irq_restore(flags);
614 return;
615 }
616
617 /* Clean up if blocked during RCU read-side critical section. */
618 if (special & RCU_READ_UNLOCK_BLOCKED) {
619 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
620
621 /*
622 * Remove this task from the ->blkd_tasks list and adjust
623 * any pointers that might have been referencing it.
624 */
625 empty = !rcu_preempt_blocked_readers_cgp();
626 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
627 np = rcu_next_node_entry(t);
628 list_del_init(&t->rcu_node_entry);
629 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
630 rcu_preempt_ctrlblk.gp_tasks = np;
631 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
632 rcu_preempt_ctrlblk.exp_tasks = np;
633#ifdef CONFIG_RCU_BOOST
634 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
635 rcu_preempt_ctrlblk.boost_tasks = np;
636#endif /* #ifdef CONFIG_RCU_BOOST */
637
638 /*
639 * If this was the last task on the current list, and if
640 * we aren't waiting on the CPU, report the quiescent state
641 * and start a new grace period if needed.
642 */
643 if (!empty && !rcu_preempt_blocked_readers_cgp()) {
644 rcu_preempt_cpu_qs();
645 rcu_preempt_start_gp();
646 }
647
648 /*
649 * If this was the last task on the expedited lists,
 650 * then we need to wake up the waiting task.
651 */
652 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
653 rcu_report_exp_done();
654 }
655#ifdef CONFIG_RCU_BOOST
656 /* Unboost self if was boosted. */
657 if (t->rcu_boost_mutex != NULL) {
658 rbmp = t->rcu_boost_mutex;
659 t->rcu_boost_mutex = NULL;
660 rt_mutex_unlock(rbmp);
661 }
662#endif /* #ifdef CONFIG_RCU_BOOST */
663 local_irq_restore(flags);
664}
665
666/*
667 * Check for a quiescent state from the current CPU. When a task blocks,
668 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
669 * checked elsewhere. This is called from the scheduling-clock interrupt.
670 *
671 * Caller must disable hard irqs.
672 */
673static void rcu_preempt_check_callbacks(void)
674{
675 struct task_struct *t = current;
676
677 if (rcu_preempt_gp_in_progress() &&
678 (!rcu_preempt_running_reader() ||
679 !rcu_cpu_blocking_cur_gp()))
680 rcu_preempt_cpu_qs();
681 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
682 rcu_preempt_ctrlblk.rcb.donetail)
683 invoke_rcu_callbacks();
684 if (rcu_preempt_gp_in_progress() &&
685 rcu_cpu_blocking_cur_gp() &&
686 rcu_preempt_running_reader() > 0)
687 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
688}
689
690/*
691 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
692 * update, so this is invoked from rcu_process_callbacks() to
693 * handle that case. Of course, it is invoked for all flavors of
694 * RCU, but RCU callbacks can appear only on one of the lists, and
695 * neither ->nexttail nor ->donetail can possibly be NULL, so there
696 * is no need for an explicit check.
697 */
698static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
699{
700 if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
701 rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
702}
703
704/*
705 * Process callbacks for preemptible RCU.
706 */
707static void rcu_preempt_process_callbacks(void)
708{
709 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
710}
711
712/*
 713 * Queue a preemptible-RCU callback for invocation after a grace period.
714 */
715void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
716{
717 unsigned long flags;
718
719 debug_rcu_head_queue(head);
720 head->func = func;
721 head->next = NULL;
722
723 local_irq_save(flags);
724 *rcu_preempt_ctrlblk.nexttail = head;
725 rcu_preempt_ctrlblk.nexttail = &head->next;
726 RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
727 rcu_preempt_start_gp(); /* checks to see if GP needed. */
728 local_irq_restore(flags);
729}
730EXPORT_SYMBOL_GPL(call_rcu);
731
732/*
733 * synchronize_rcu - wait until a grace period has elapsed.
734 *
735 * Control will return to the caller some time after a full grace
736 * period has elapsed, in other words after all currently executing RCU
737 * read-side critical sections have completed. RCU read-side critical
738 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
739 * and may be nested.
740 */
741void synchronize_rcu(void)
742{
743 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
744 !lock_is_held(&rcu_lock_map) &&
745 !lock_is_held(&rcu_sched_lock_map),
746 "Illegal synchronize_rcu() in RCU read-side critical section");
747
748#ifdef CONFIG_DEBUG_LOCK_ALLOC
749 if (!rcu_scheduler_active)
750 return;
751#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
752
753 WARN_ON_ONCE(rcu_preempt_running_reader());
754 if (!rcu_preempt_blocked_readers_any())
755 return;
756
757 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
758 if (rcu_expedited)
759 synchronize_rcu_expedited();
760 else
761 rcu_barrier();
762}
763EXPORT_SYMBOL_GPL(synchronize_rcu);
764
765static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
766static unsigned long sync_rcu_preempt_exp_count;
767static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
768
769/*
770 * Return non-zero if there are any tasks in RCU read-side critical
771 * sections blocking the current preemptible-RCU expedited grace period.
772 * If there is no preemptible-RCU expedited grace period currently in
773 * progress, returns zero unconditionally.
774 */
775static int rcu_preempted_readers_exp(void)
776{
777 return rcu_preempt_ctrlblk.exp_tasks != NULL;
778}
779
780/*
781 * Report the exit from RCU read-side critical section for the last task
782 * that queued itself during or before the current expedited preemptible-RCU
783 * grace period.
784 */
785static void rcu_report_exp_done(void)
786{
787 wake_up(&sync_rcu_preempt_exp_wq);
788}
789
790/*
791 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
 792 * is to rely on the fact that there is but one CPU, and that it is
793 * illegal for a task to invoke synchronize_rcu_expedited() while in a
794 * preemptible-RCU read-side critical section. Therefore, any such
795 * critical sections must correspond to blocked tasks, which must therefore
796 * be on the ->blkd_tasks list. So just record the current head of the
797 * list in the ->exp_tasks pointer, and wait for all tasks including and
798 * after the task pointed to by ->exp_tasks to drain.
799 */
800void synchronize_rcu_expedited(void)
801{
802 unsigned long flags;
803 struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
804 unsigned long snap;
805
806 barrier(); /* ensure prior action seen before grace period. */
807
808 WARN_ON_ONCE(rcu_preempt_running_reader());
809
810 /*
811 * Acquire lock so that there is only one preemptible RCU grace
812 * period in flight. Of course, if someone does the expedited
813 * grace period for us while we are acquiring the lock, just leave.
814 */
815 snap = sync_rcu_preempt_exp_count + 1;
816 mutex_lock(&sync_rcu_preempt_exp_mutex);
817 if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
818 goto unlock_mb_ret; /* Others did our work for us. */
819
820 local_irq_save(flags);
821
822 /*
823 * All RCU readers have to already be on blkd_tasks because
824 * we cannot legally be executing in an RCU read-side critical
825 * section.
826 */
827
828 /* Snapshot current head of ->blkd_tasks list. */
829 rpcp->exp_tasks = rpcp->blkd_tasks.next;
830 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
831 rpcp->exp_tasks = NULL;
832
833 /* Wait for tail of ->blkd_tasks list to drain. */
834 if (!rcu_preempted_readers_exp()) {
835 local_irq_restore(flags);
836 } else {
837 rcu_initiate_boost();
838 local_irq_restore(flags);
839 wait_event(sync_rcu_preempt_exp_wq,
840 !rcu_preempted_readers_exp());
841 }
842
843 /* Clean up and exit. */
844 barrier(); /* ensure expedited GP seen before counter increment. */
845 sync_rcu_preempt_exp_count++;
846unlock_mb_ret:
847 mutex_unlock(&sync_rcu_preempt_exp_mutex);
848 barrier(); /* ensure subsequent action seen after grace period. */
849}
850EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
851
852/*
853 * Does preemptible RCU need the CPU to stay out of dynticks mode?
854 */
855int rcu_preempt_needs_cpu(void)
856{
857 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
858}
859
860#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
861
862#ifdef CONFIG_RCU_TRACE
863
864/*
865 * Because preemptible RCU does not exist, it is not necessary to
866 * dump out its statistics.
867 */
868static void show_tiny_preempt_stats(struct seq_file *m)
869{
870}
871
872#endif /* #ifdef CONFIG_RCU_TRACE */
873
874/*
875 * Because preemptible RCU does not exist, it never has any callbacks
876 * to check.
877 */
878static void rcu_preempt_check_callbacks(void)
879{
880}
881
882/*
883 * Because preemptible RCU does not exist, it never has any callbacks
884 * to remove.
885 */
886static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
887{
888}
889
890/*
891 * Because preemptible RCU does not exist, it never has any callbacks
892 * to process.
893 */
894static void rcu_preempt_process_callbacks(void)
895{
896}
897
898#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
899
900#ifdef CONFIG_RCU_BOOST
901
902/*
903 * Wake up rcu_kthread() to process callbacks now eligible for invocation
904 * or to boost readers.
905 */
906static void invoke_rcu_callbacks(void)
907{
908 have_rcu_kthread_work = 1;
909 if (rcu_kthread_task != NULL)
910 wake_up(&rcu_kthread_wq);
911}
912
913#ifdef CONFIG_RCU_TRACE
914
915/*
916 * Is the current CPU running the RCU-callbacks kthread?
917 * Caller must have preemption disabled.
918 */
919static bool rcu_is_callbacks_kthread(void)
920{
921 return rcu_kthread_task == current;
922}
923
924#endif /* #ifdef CONFIG_RCU_TRACE */
925
926/*
927 * This kthread invokes RCU callbacks whose grace periods have
928 * elapsed. It is awakened as needed, and takes the place of the
929 * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
930 * This is a kthread, but it is never stopped, at least not until
931 * the system goes down.
932 */
933static int rcu_kthread(void *arg)
934{
935 unsigned long work;
936 unsigned long morework;
937 unsigned long flags;
938
939 for (;;) {
940 wait_event_interruptible(rcu_kthread_wq,
941 have_rcu_kthread_work != 0);
942 morework = rcu_boost();
943 local_irq_save(flags);
944 work = have_rcu_kthread_work;
945 have_rcu_kthread_work = morework;
946 local_irq_restore(flags);
947 if (work)
948 rcu_process_callbacks(NULL);
949 schedule_timeout_interruptible(1); /* Leave CPU for others. */
950 }
951
952 return 0; /* Not reached, but needed to shut gcc up. */
953}
954
955/*
956 * Spawn the kthread that invokes RCU callbacks.
957 */
958static int __init rcu_spawn_kthreads(void)
959{
960 struct sched_param sp;
961
962 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
963 sp.sched_priority = RCU_BOOST_PRIO;
964 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
965 return 0;
966}
967early_initcall(rcu_spawn_kthreads);
968
969#else /* #ifdef CONFIG_RCU_BOOST */
970
971/* Hold off callback invocation until early_initcall() time. */
972static int rcu_scheduler_fully_active __read_mostly;
973
974/*
975 * Start up softirq processing of callbacks.
976 */
977void invoke_rcu_callbacks(void)
978{
979 if (rcu_scheduler_fully_active)
980 raise_softirq(RCU_SOFTIRQ);
981}
982
983#ifdef CONFIG_RCU_TRACE
984
985/*
986 * There is no callback kthread, so this thread is never it.
987 */
988static bool rcu_is_callbacks_kthread(void)
989{
990 return false;
991}
992
993#endif /* #ifdef CONFIG_RCU_TRACE */
994
995static int __init rcu_scheduler_really_started(void)
996{
997 rcu_scheduler_fully_active = 1;
998 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
999 raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */
1000 return 0;
1001}
1002early_initcall(rcu_scheduler_really_started);
1003
1004#endif /* #else #ifdef CONFIG_RCU_BOOST */
1005
1006#ifdef CONFIG_DEBUG_LOCK_ALLOC
1007#include <linux/kernel_stat.h>
1008 60
1009/* 61/*
1010 * During boot, we forgive RCU lockdep issues. After this function is 62 * During boot, we forgive RCU lockdep issues. After this function is
@@ -1020,25 +72,6 @@ void __init rcu_scheduler_starting(void)
1020 72
1021#ifdef CONFIG_RCU_TRACE 73#ifdef CONFIG_RCU_TRACE
1022 74
1023#ifdef CONFIG_RCU_BOOST
1024
1025static void rcu_initiate_boost_trace(void)
1026{
1027 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
1028 rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
1029 else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
1030 rcu_preempt_ctrlblk.exp_tasks == NULL)
1031 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
1032 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
1033 rcu_preempt_ctrlblk.n_balk_boost_tasks++;
1034 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
1035 rcu_preempt_ctrlblk.n_balk_notyet++;
1036 else
1037 rcu_preempt_ctrlblk.n_balk_nos++;
1038}
1039
1040#endif /* #ifdef CONFIG_RCU_BOOST */
1041
1042static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) 75static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
1043{ 76{
1044 unsigned long flags; 77 unsigned long flags;
@@ -1053,7 +86,6 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
1053 */ 86 */
1054static int show_tiny_stats(struct seq_file *m, void *unused) 87static int show_tiny_stats(struct seq_file *m, void *unused)
1055{ 88{
1056 show_tiny_preempt_stats(m);
1057 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); 89 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
1058 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); 90 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
1059 return 0; 91 return 0;
@@ -1103,11 +135,40 @@ MODULE_AUTHOR("Paul E. McKenney");
1103MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); 135MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1104MODULE_LICENSE("GPL"); 136MODULE_LICENSE("GPL");
1105 137
1106static void check_cpu_stall_preempt(void) 138static void check_cpu_stall(struct rcu_ctrlblk *rcp)
1107{ 139{
1108#ifdef CONFIG_TINY_PREEMPT_RCU 140 unsigned long j;
1109 check_cpu_stall(&rcu_preempt_ctrlblk.rcb); 141 unsigned long js;
1110#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 142
143 if (rcu_cpu_stall_suppress)
144 return;
145 rcp->ticks_this_gp++;
146 j = jiffies;
147 js = rcp->jiffies_stall;
148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
151 jiffies - rcp->gp_start, rcp->qlen);
152 dump_stack();
153 }
154 if (*rcp->curtail && ULONG_CMP_GE(j, js))
155 rcp->jiffies_stall = jiffies +
156 3 * rcu_jiffies_till_stall_check() + 3;
157 else if (ULONG_CMP_GE(j, js))
158 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
159}
160
161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
162{
163 rcp->ticks_this_gp = 0;
164 rcp->gp_start = jiffies;
165 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
166}
167
168static void check_cpu_stalls(void)
169{
170 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
171 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
1111} 172}
1112 173
1113#endif /* #ifdef CONFIG_RCU_TRACE */ 174#endif /* #ifdef CONFIG_RCU_TRACE */
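The stall-check code that moves around in this file compares jiffies values with ULONG_CMP_GE()/ULONG_CMP_LT() rather than plain >= so the test stays correct when the counter wraps. The macros below show the usual modular-arithmetic form of those comparisons (they live in rcupdate.h; treat this as an illustrative sketch rather than a quotation):

#include <limits.h>

/* Wrap-safe "a >= b" and "a < b" for free-running unsigned long counters. */
#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

/* Example: has the deadline stored in ->jiffies_stall passed? */
static int stall_deadline_passed(unsigned long now, unsigned long deadline)
{
        return ULONG_CMP_GE(now, deadline);     /* true even across a wrap */
}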
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e1f3a8c96724..f4871e52c546 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -695,44 +695,6 @@ static struct rcu_torture_ops srcu_sync_ops = {
695 .name = "srcu_sync" 695 .name = "srcu_sync"
696}; 696};
697 697
698static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
699{
700 return srcu_read_lock_raw(&srcu_ctl);
701}
702
703static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
704{
705 srcu_read_unlock_raw(&srcu_ctl, idx);
706}
707
708static struct rcu_torture_ops srcu_raw_ops = {
709 .init = rcu_sync_torture_init,
710 .readlock = srcu_torture_read_lock_raw,
711 .read_delay = srcu_read_delay,
712 .readunlock = srcu_torture_read_unlock_raw,
713 .completed = srcu_torture_completed,
714 .deferred_free = srcu_torture_deferred_free,
715 .sync = srcu_torture_synchronize,
716 .call = NULL,
717 .cb_barrier = NULL,
718 .stats = srcu_torture_stats,
719 .name = "srcu_raw"
720};
721
722static struct rcu_torture_ops srcu_raw_sync_ops = {
723 .init = rcu_sync_torture_init,
724 .readlock = srcu_torture_read_lock_raw,
725 .read_delay = srcu_read_delay,
726 .readunlock = srcu_torture_read_unlock_raw,
727 .completed = srcu_torture_completed,
728 .deferred_free = rcu_sync_torture_deferred_free,
729 .sync = srcu_torture_synchronize,
730 .call = NULL,
731 .cb_barrier = NULL,
732 .stats = srcu_torture_stats,
733 .name = "srcu_raw_sync"
734};
735
736static void srcu_torture_synchronize_expedited(void) 698static void srcu_torture_synchronize_expedited(void)
737{ 699{
738 synchronize_srcu_expedited(&srcu_ctl); 700 synchronize_srcu_expedited(&srcu_ctl);
@@ -1514,7 +1476,7 @@ rcu_torture_shutdown(void *arg)
1514 * Execute random CPU-hotplug operations at the interval specified 1476 * Execute random CPU-hotplug operations at the interval specified
1515 * by the onoff_interval. 1477 * by the onoff_interval.
1516 */ 1478 */
1517static int __cpuinit 1479static int
1518rcu_torture_onoff(void *arg) 1480rcu_torture_onoff(void *arg)
1519{ 1481{
1520 int cpu; 1482 int cpu;
@@ -1596,7 +1558,7 @@ rcu_torture_onoff(void *arg)
1596 return 0; 1558 return 0;
1597} 1559}
1598 1560
1599static int __cpuinit 1561static int
1600rcu_torture_onoff_init(void) 1562rcu_torture_onoff_init(void)
1601{ 1563{
1602 int ret; 1564 int ret;
@@ -1639,7 +1601,7 @@ static void rcu_torture_onoff_cleanup(void)
1639 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then 1601 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
1640 * induces a CPU stall for the time specified by stall_cpu. 1602 * induces a CPU stall for the time specified by stall_cpu.
1641 */ 1603 */
1642static int __cpuinit rcu_torture_stall(void *args) 1604static int rcu_torture_stall(void *args)
1643{ 1605{
1644 unsigned long stop_at; 1606 unsigned long stop_at;
1645 1607
@@ -1983,7 +1945,6 @@ rcu_torture_init(void)
1983 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1945 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1984 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1946 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1985 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, 1947 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
1986 &srcu_raw_ops, &srcu_raw_sync_ops,
1987 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1948 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1988 1949
1989 mutex_lock(&fullstop_mutex); 1950 mutex_lock(&fullstop_mutex);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 35380019f0fc..068de3a93606 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -218,8 +218,8 @@ module_param(blimit, long, 0444);
218module_param(qhimark, long, 0444); 218module_param(qhimark, long, 0444);
219module_param(qlowmark, long, 0444); 219module_param(qlowmark, long, 0444);
220 220
221static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; 221static ulong jiffies_till_first_fqs = ULONG_MAX;
222static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; 222static ulong jiffies_till_next_fqs = ULONG_MAX;
223 223
224module_param(jiffies_till_first_fqs, ulong, 0644); 224module_param(jiffies_till_first_fqs, ulong, 0644);
225module_param(jiffies_till_next_fqs, ulong, 0644); 225module_param(jiffies_till_next_fqs, ulong, 0644);
@@ -866,7 +866,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
866 * See Documentation/RCU/stallwarn.txt for info on how to debug 866 * See Documentation/RCU/stallwarn.txt for info on how to debug
867 * RCU CPU stall warnings. 867 * RCU CPU stall warnings.
868 */ 868 */
869 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", 869 pr_err("INFO: %s detected stalls on CPUs/tasks:",
870 rsp->name); 870 rsp->name);
871 print_cpu_stall_info_begin(); 871 print_cpu_stall_info_begin();
872 rcu_for_each_leaf_node(rsp, rnp) { 872 rcu_for_each_leaf_node(rsp, rnp) {
@@ -899,7 +899,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
899 smp_processor_id(), (long)(jiffies - rsp->gp_start), 899 smp_processor_id(), (long)(jiffies - rsp->gp_start),
900 rsp->gpnum, rsp->completed, totqlen); 900 rsp->gpnum, rsp->completed, totqlen);
901 if (ndetected == 0) 901 if (ndetected == 0)
902 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 902 pr_err("INFO: Stall ended before state dump start\n");
903 else if (!trigger_all_cpu_backtrace()) 903 else if (!trigger_all_cpu_backtrace())
904 rcu_dump_cpu_stacks(rsp); 904 rcu_dump_cpu_stacks(rsp);
905 905
@@ -922,7 +922,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
922 * See Documentation/RCU/stallwarn.txt for info on how to debug 922 * See Documentation/RCU/stallwarn.txt for info on how to debug
923 * RCU CPU stall warnings. 923 * RCU CPU stall warnings.
924 */ 924 */
925 printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); 925 pr_err("INFO: %s self-detected stall on CPU", rsp->name);
926 print_cpu_stall_info_begin(); 926 print_cpu_stall_info_begin();
927 print_cpu_stall_info(rsp, smp_processor_id()); 927 print_cpu_stall_info(rsp, smp_processor_id());
928 print_cpu_stall_info_end(); 928 print_cpu_stall_info_end();
@@ -985,65 +985,6 @@ void rcu_cpu_stall_reset(void)
985} 985}
986 986
987/* 987/*
988 * Update CPU-local rcu_data state to record the newly noticed grace period.
989 * This is used both when we started the grace period and when we notice
990 * that someone else started the grace period. The caller must hold the
991 * ->lock of the leaf rcu_node structure corresponding to the current CPU,
992 * and must have irqs disabled.
993 */
994static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
995{
996 if (rdp->gpnum != rnp->gpnum) {
997 /*
998 * If the current grace period is waiting for this CPU,
999 * set up to detect a quiescent state, otherwise don't
1000 * go looking for one.
1001 */
1002 rdp->gpnum = rnp->gpnum;
1003 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
1004 rdp->passed_quiesce = 0;
1005 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1006 zero_cpu_stall_ticks(rdp);
1007 }
1008}
1009
1010static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
1011{
1012 unsigned long flags;
1013 struct rcu_node *rnp;
1014
1015 local_irq_save(flags);
1016 rnp = rdp->mynode;
1017 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
1018 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
1019 local_irq_restore(flags);
1020 return;
1021 }
1022 __note_new_gpnum(rsp, rnp, rdp);
1023 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1024}
1025
1026/*
1027 * Did someone else start a new RCU grace period start since we last
1028 * checked? Update local state appropriately if so. Must be called
1029 * on the CPU corresponding to rdp.
1030 */
1031static int
1032check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
1033{
1034 unsigned long flags;
1035 int ret = 0;
1036
1037 local_irq_save(flags);
1038 if (rdp->gpnum != rsp->gpnum) {
1039 note_new_gpnum(rsp, rdp);
1040 ret = 1;
1041 }
1042 local_irq_restore(flags);
1043 return ret;
1044}
1045
1046/*
1047 * Initialize the specified rcu_data structure's callback list to empty. 988 * Initialize the specified rcu_data structure's callback list to empty.
1048 */ 989 */
1049static void init_callback_list(struct rcu_data *rdp) 990static void init_callback_list(struct rcu_data *rdp)
@@ -1313,18 +1254,16 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1313} 1254}
1314 1255
1315/* 1256/*
1316 * Advance this CPU's callbacks, but only if the current grace period 1257 * Update CPU-local rcu_data state to record the beginnings and ends of
1317 * has ended. This may be called only from the CPU to whom the rdp 1258 * grace periods. The caller must hold the ->lock of the leaf rcu_node
1318 * belongs. In addition, the corresponding leaf rcu_node structure's 1259 * structure corresponding to the current CPU, and must have irqs disabled.
1319 * ->lock must be held by the caller, with irqs disabled.
1320 */ 1260 */
1321static void 1261static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1322__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1323{ 1262{
1324 /* Did another grace period end? */ 1263 /* Handle the ends of any preceding grace periods first. */
1325 if (rdp->completed == rnp->completed) { 1264 if (rdp->completed == rnp->completed) {
1326 1265
1327 /* No, so just accelerate recent callbacks. */ 1266 /* No grace period end, so just accelerate recent callbacks. */
1328 rcu_accelerate_cbs(rsp, rnp, rdp); 1267 rcu_accelerate_cbs(rsp, rnp, rdp);
1329 1268
1330 } else { 1269 } else {
@@ -1335,68 +1274,40 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
1335 /* Remember that we saw this grace-period completion. */ 1274 /* Remember that we saw this grace-period completion. */
1336 rdp->completed = rnp->completed; 1275 rdp->completed = rnp->completed;
1337 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); 1276 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
1277 }
1338 1278
1279 if (rdp->gpnum != rnp->gpnum) {
1339 /* 1280 /*
1340 * If we were in an extended quiescent state, we may have 1281 * If the current grace period is waiting for this CPU,
1341 * missed some grace periods that others CPUs handled on 1282 * set up to detect a quiescent state, otherwise don't
1342 * our behalf. Catch up with this state to avoid noting 1283 * go looking for one.
1343 * spurious new grace periods. If another grace period
1344 * has started, then rnp->gpnum will have advanced, so
1345 * we will detect this later on. Of course, any quiescent
1346 * states we found for the old GP are now invalid.
1347 */
1348 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) {
1349 rdp->gpnum = rdp->completed;
1350 rdp->passed_quiesce = 0;
1351 }
1352
1353 /*
1354 * If RCU does not need a quiescent state from this CPU,
1355 * then make sure that this CPU doesn't go looking for one.
1356 */ 1284 */
1357 if ((rnp->qsmask & rdp->grpmask) == 0) 1285 rdp->gpnum = rnp->gpnum;
1358 rdp->qs_pending = 0; 1286 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
1287 rdp->passed_quiesce = 0;
1288 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1289 zero_cpu_stall_ticks(rdp);
1359 } 1290 }
1360} 1291}
1361 1292
1362/* 1293static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1363 * Advance this CPU's callbacks, but only if the current grace period
1364 * has ended. This may be called only from the CPU to whom the rdp
1365 * belongs.
1366 */
1367static void
1368rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
1369{ 1294{
1370 unsigned long flags; 1295 unsigned long flags;
1371 struct rcu_node *rnp; 1296 struct rcu_node *rnp;
1372 1297
1373 local_irq_save(flags); 1298 local_irq_save(flags);
1374 rnp = rdp->mynode; 1299 rnp = rdp->mynode;
1375 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ 1300 if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
1301 rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */
1376 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ 1302 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
1377 local_irq_restore(flags); 1303 local_irq_restore(flags);
1378 return; 1304 return;
1379 } 1305 }
1380 __rcu_process_gp_end(rsp, rnp, rdp); 1306 __note_gp_changes(rsp, rnp, rdp);
1381 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1307 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1382} 1308}
1383 1309
1384/* 1310/*
1385 * Do per-CPU grace-period initialization for running CPU. The caller
1386 * must hold the lock of the leaf rcu_node structure corresponding to
1387 * this CPU.
1388 */
1389static void
1390rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1391{
1392 /* Prior grace period ended, so advance callbacks for current CPU. */
1393 __rcu_process_gp_end(rsp, rnp, rdp);
1394
1395 /* Set state so that this CPU will detect the next quiescent state. */
1396 __note_new_gpnum(rsp, rnp, rdp);
1397}
1398
1399/*
1400 * Initialize a new grace period. 1311 * Initialize a new grace period.
1401 */ 1312 */
1402static int rcu_gp_init(struct rcu_state *rsp) 1313static int rcu_gp_init(struct rcu_state *rsp)
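The hunk above folds the old gpnum/completed tracking into a single __note_gp_changes()/note_gp_changes() pair. A minimal userspace sketch of the bookkeeping being consolidated, with invented simplified types, no rcu_node locking, tracing, or wrap-safe counter comparisons; it only shows when a CPU starts looking for a quiescent state:

        /* Illustrative userspace model of the gpnum/completed bookkeeping
         * that __note_gp_changes() now handles in one place; not kernel code. */
        #include <stdbool.h>
        #include <stdio.h>

        struct node   { unsigned long gpnum, completed, qsmask; };
        struct percpu { unsigned long gpnum, completed, grpmask;
                        bool passed_quiesce, qs_pending; };

        static void note_gp_changes_model(struct node *rnp, struct percpu *rdp)
        {
                if (rdp->completed != rnp->completed) {
                        /* A grace period this CPU took part in has ended. */
                        rdp->completed = rnp->completed;
                }
                if (rdp->gpnum != rnp->gpnum) {
                        /* A new grace period has started; look for a quiescent
                         * state only if this CPU is actually being waited on. */
                        rdp->gpnum = rnp->gpnum;
                        rdp->passed_quiesce = false;
                        rdp->qs_pending = (rnp->qsmask & rdp->grpmask) != 0;
                }
        }

        int main(void)
        {
                struct node   rnp = { .gpnum = 5, .completed = 4, .qsmask = 0x1 };
                struct percpu rdp = { .gpnum = 4, .completed = 4, .grpmask = 0x1 };

                note_gp_changes_model(&rnp, &rdp);
                printf("gpnum=%lu qs_pending=%d\n", rdp.gpnum, rdp.qs_pending);
                return 0;
        }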
@@ -1444,7 +1355,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1444 WARN_ON_ONCE(rnp->completed != rsp->completed); 1355 WARN_ON_ONCE(rnp->completed != rsp->completed);
1445 ACCESS_ONCE(rnp->completed) = rsp->completed; 1356 ACCESS_ONCE(rnp->completed) = rsp->completed;
1446 if (rnp == rdp->mynode) 1357 if (rnp == rdp->mynode)
1447 rcu_start_gp_per_cpu(rsp, rnp, rdp); 1358 __note_gp_changes(rsp, rnp, rdp);
1448 rcu_preempt_boost_start_gp(rnp); 1359 rcu_preempt_boost_start_gp(rnp);
1449 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 1360 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1450 rnp->level, rnp->grplo, 1361 rnp->level, rnp->grplo,
@@ -1527,7 +1438,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1527 ACCESS_ONCE(rnp->completed) = rsp->gpnum; 1438 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1528 rdp = this_cpu_ptr(rsp->rda); 1439 rdp = this_cpu_ptr(rsp->rda);
1529 if (rnp == rdp->mynode) 1440 if (rnp == rdp->mynode)
1530 __rcu_process_gp_end(rsp, rnp, rdp); 1441 __note_gp_changes(rsp, rnp, rdp);
1531 nocb += rcu_future_gp_cleanup(rsp, rnp); 1442 nocb += rcu_future_gp_cleanup(rsp, rnp);
1532 raw_spin_unlock_irq(&rnp->lock); 1443 raw_spin_unlock_irq(&rnp->lock);
1533 cond_resched(); 1444 cond_resched();
@@ -1805,9 +1716,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1805static void 1716static void
1806rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) 1717rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1807{ 1718{
1808 /* If there is now a new grace period, record and return. */ 1719 /* Check for grace-period ends and beginnings. */
1809 if (check_for_new_grace_period(rsp, rdp)) 1720 note_gp_changes(rsp, rdp);
1810 return;
1811 1721
1812 /* 1722 /*
1813 * Does this CPU still need to do its part for current grace period? 1723 * Does this CPU still need to do its part for current grace period?
@@ -2271,9 +2181,6 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2271 2181
2272 WARN_ON_ONCE(rdp->beenonline == 0); 2182 WARN_ON_ONCE(rdp->beenonline == 0);
2273 2183
2274 /* Handle the end of a grace period that some other CPU ended. */
2275 rcu_process_gp_end(rsp, rdp);
2276
2277 /* Update RCU state based on any recent quiescent states. */ 2184 /* Update RCU state based on any recent quiescent states. */
2278 rcu_check_quiescent_state(rsp, rdp); 2185 rcu_check_quiescent_state(rsp, rdp);
2279 2186
@@ -2358,8 +2265,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2358 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 2265 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
2359 2266
2360 /* Are we ignoring a completed grace period? */ 2267 /* Are we ignoring a completed grace period? */
2361 rcu_process_gp_end(rsp, rdp); 2268 note_gp_changes(rsp, rdp);
2362 check_for_new_grace_period(rsp, rdp);
2363 2269
2364 /* Start a new grace period if one not already started. */ 2270 /* Start a new grace period if one not already started. */
2365 if (!rcu_gp_in_progress(rsp)) { 2271 if (!rcu_gp_in_progress(rsp)) {
@@ -3004,7 +2910,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
3004 * can accept some slop in the rsp->completed access due to the fact 2910 * can accept some slop in the rsp->completed access due to the fact
3005 * that this CPU cannot possibly have any RCU callbacks in flight yet. 2911 * that this CPU cannot possibly have any RCU callbacks in flight yet.
3006 */ 2912 */
3007static void __cpuinit 2913static void
3008rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) 2914rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
3009{ 2915{
3010 unsigned long flags; 2916 unsigned long flags;
@@ -3056,7 +2962,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
3056 mutex_unlock(&rsp->onoff_mutex); 2962 mutex_unlock(&rsp->onoff_mutex);
3057} 2963}
3058 2964
3059static void __cpuinit rcu_prepare_cpu(int cpu) 2965static void rcu_prepare_cpu(int cpu)
3060{ 2966{
3061 struct rcu_state *rsp; 2967 struct rcu_state *rsp;
3062 2968
@@ -3068,7 +2974,7 @@ static void __cpuinit rcu_prepare_cpu(int cpu)
3068/* 2974/*
3069 * Handle CPU online/offline notification events. 2975 * Handle CPU online/offline notification events.
3070 */ 2976 */
3071static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 2977static int rcu_cpu_notify(struct notifier_block *self,
3072 unsigned long action, void *hcpu) 2978 unsigned long action, void *hcpu)
3073{ 2979{
3074 long cpu = (long)hcpu; 2980 long cpu = (long)hcpu;
@@ -3120,7 +3026,7 @@ static int __init rcu_spawn_gp_kthread(void)
3120 struct task_struct *t; 3026 struct task_struct *t;
3121 3027
3122 for_each_rcu_flavor(rsp) { 3028 for_each_rcu_flavor(rsp) {
3123 t = kthread_run(rcu_gp_kthread, rsp, rsp->name); 3029 t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
3124 BUG_ON(IS_ERR(t)); 3030 BUG_ON(IS_ERR(t));
3125 rnp = rcu_get_root(rsp); 3031 rnp = rcu_get_root(rsp);
3126 raw_spin_lock_irqsave(&rnp->lock, flags); 3032 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -3265,11 +3171,25 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3265 */ 3171 */
3266static void __init rcu_init_geometry(void) 3172static void __init rcu_init_geometry(void)
3267{ 3173{
3174 ulong d;
3268 int i; 3175 int i;
3269 int j; 3176 int j;
3270 int n = nr_cpu_ids; 3177 int n = nr_cpu_ids;
3271 int rcu_capacity[MAX_RCU_LVLS + 1]; 3178 int rcu_capacity[MAX_RCU_LVLS + 1];
3272 3179
3180 /*
3181 * Initialize any unspecified boot parameters.
3182 * The default values of jiffies_till_first_fqs and
3183 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
3184 * value, which is a function of HZ, then adding one for each
3185 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
3186 */
3187 d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
3188 if (jiffies_till_first_fqs == ULONG_MAX)
3189 jiffies_till_first_fqs = d;
3190 if (jiffies_till_next_fqs == ULONG_MAX)
3191 jiffies_till_next_fqs = d;
3192
3273 /* If the compile-time values are accurate, just leave. */ 3193 /* If the compile-time values are accurate, just leave. */
3274 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && 3194 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
3275 nr_cpu_ids == NR_CPUS) 3195 nr_cpu_ids == NR_CPUS)
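The new block in rcu_init_geometry() derives the default force-quiescent-state delay from HZ and the number of possible CPUs. A small standalone sketch of that arithmetic; the macro values mirror the rcutree.h hunk below, while HZ and nr_cpu_ids are example inputs, not taken from any real configuration:

        /* Standalone sketch of the default jiffies_till_*_fqs computation. */
        #include <stdio.h>

        #define HZ 1000
        #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
        #define RCU_JIFFIES_FQS_DIV 256

        int main(void)
        {
                unsigned long nr_cpu_ids = 1024;        /* example system size */
                unsigned long d;

                /* Base delay scales with HZ, plus one jiffy per 256 possible CPUs. */
                d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
                printf("default fqs delay: %lu jiffies\n", d);  /* 3 + 4 = 7 here */
                return 0;
        }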
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4df503470e42..b3832581043c 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -343,12 +343,17 @@ struct rcu_data {
343#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 343#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
344#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 344#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
345 345
346#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 346#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
347 /* For jiffies_till_first_fqs and */
348 /* and jiffies_till_next_fqs. */
347 349
348#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 350#define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */
349 /* to take at least one */ 351 /* delay between bouts of */
350 /* scheduling clock irq */ 352 /* quiescent-state forcing. */
351 /* before ratting on them. */ 353
354#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */
355 /* at least one scheduling clock */
356 /* irq before ratting on them. */
352 357
353#define rcu_wait(cond) \ 358#define rcu_wait(cond) \
354do { \ 359do { \
@@ -516,10 +521,10 @@ static void invoke_rcu_callbacks_kthread(void);
516static bool rcu_is_callbacks_kthread(void); 521static bool rcu_is_callbacks_kthread(void);
517#ifdef CONFIG_RCU_BOOST 522#ifdef CONFIG_RCU_BOOST
518static void rcu_preempt_do_callbacks(void); 523static void rcu_preempt_do_callbacks(void);
519static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 524static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
520 struct rcu_node *rnp); 525 struct rcu_node *rnp);
521#endif /* #ifdef CONFIG_RCU_BOOST */ 526#endif /* #ifdef CONFIG_RCU_BOOST */
522static void __cpuinit rcu_prepare_kthreads(int cpu); 527static void rcu_prepare_kthreads(int cpu);
523static void rcu_cleanup_after_idle(int cpu); 528static void rcu_cleanup_after_idle(int cpu);
524static void rcu_prepare_for_idle(int cpu); 529static void rcu_prepare_for_idle(int cpu);
525static void rcu_idle_count_callbacks_posted(void); 530static void rcu_idle_count_callbacks_posted(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3db5a375d8dd..769e12e3151b 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -53,38 +53,37 @@ static char __initdata nocb_buf[NR_CPUS * 5];
53static void __init rcu_bootup_announce_oddness(void) 53static void __init rcu_bootup_announce_oddness(void)
54{ 54{
55#ifdef CONFIG_RCU_TRACE 55#ifdef CONFIG_RCU_TRACE
56 printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n"); 56 pr_info("\tRCU debugfs-based tracing is enabled.\n");
57#endif 57#endif
58#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) 58#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
59 printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n", 59 pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
60 CONFIG_RCU_FANOUT); 60 CONFIG_RCU_FANOUT);
61#endif 61#endif
62#ifdef CONFIG_RCU_FANOUT_EXACT 62#ifdef CONFIG_RCU_FANOUT_EXACT
63 printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n"); 63 pr_info("\tHierarchical RCU autobalancing is disabled.\n");
64#endif 64#endif
65#ifdef CONFIG_RCU_FAST_NO_HZ 65#ifdef CONFIG_RCU_FAST_NO_HZ
66 printk(KERN_INFO 66 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
67 "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
68#endif 67#endif
69#ifdef CONFIG_PROVE_RCU 68#ifdef CONFIG_PROVE_RCU
70 printk(KERN_INFO "\tRCU lockdep checking is enabled.\n"); 69 pr_info("\tRCU lockdep checking is enabled.\n");
71#endif 70#endif
72#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 71#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
73 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 72 pr_info("\tRCU torture testing starts during boot.\n");
74#endif 73#endif
75#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 74#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
76 printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); 75 pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");
77#endif 76#endif
78#if defined(CONFIG_RCU_CPU_STALL_INFO) 77#if defined(CONFIG_RCU_CPU_STALL_INFO)
79 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); 78 pr_info("\tAdditional per-CPU info printed with stalls.\n");
80#endif 79#endif
81#if NUM_RCU_LVL_4 != 0 80#if NUM_RCU_LVL_4 != 0
82 printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); 81 pr_info("\tFour-level hierarchy is enabled.\n");
83#endif 82#endif
84 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) 83 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
85 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 84 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
86 if (nr_cpu_ids != NR_CPUS) 85 if (nr_cpu_ids != NR_CPUS)
87 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 86 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
88#ifdef CONFIG_RCU_NOCB_CPU 87#ifdef CONFIG_RCU_NOCB_CPU
89#ifndef CONFIG_RCU_NOCB_CPU_NONE 88#ifndef CONFIG_RCU_NOCB_CPU_NONE
90 if (!have_rcu_nocb_mask) { 89 if (!have_rcu_nocb_mask) {
@@ -92,19 +91,19 @@ static void __init rcu_bootup_announce_oddness(void)
92 have_rcu_nocb_mask = true; 91 have_rcu_nocb_mask = true;
93 } 92 }
94#ifdef CONFIG_RCU_NOCB_CPU_ZERO 93#ifdef CONFIG_RCU_NOCB_CPU_ZERO
95 pr_info("\tExperimental no-CBs CPU 0\n"); 94 pr_info("\tOffload RCU callbacks from CPU 0\n");
96 cpumask_set_cpu(0, rcu_nocb_mask); 95 cpumask_set_cpu(0, rcu_nocb_mask);
97#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ 96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
98#ifdef CONFIG_RCU_NOCB_CPU_ALL 97#ifdef CONFIG_RCU_NOCB_CPU_ALL
99 pr_info("\tExperimental no-CBs for all CPUs\n"); 98 pr_info("\tOffload RCU callbacks from all CPUs\n");
100 cpumask_setall(rcu_nocb_mask); 99 cpumask_setall(rcu_nocb_mask);
101#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ 100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
102#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ 101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
103 if (have_rcu_nocb_mask) { 102 if (have_rcu_nocb_mask) {
104 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 103 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
105 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); 104 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
106 if (rcu_nocb_poll) 105 if (rcu_nocb_poll)
107 pr_info("\tExperimental polled no-CBs CPUs.\n"); 106 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
108 } 107 }
109#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 108#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
110} 109}
@@ -123,7 +122,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
123 */ 122 */
124static void __init rcu_bootup_announce(void) 123static void __init rcu_bootup_announce(void)
125{ 124{
126 printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); 125 pr_info("Preemptible hierarchical RCU implementation.\n");
127 rcu_bootup_announce_oddness(); 126 rcu_bootup_announce_oddness();
128} 127}
129 128
@@ -490,13 +489,13 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
490 489
491static void rcu_print_task_stall_begin(struct rcu_node *rnp) 490static void rcu_print_task_stall_begin(struct rcu_node *rnp)
492{ 491{
493 printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", 492 pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
494 rnp->level, rnp->grplo, rnp->grphi); 493 rnp->level, rnp->grplo, rnp->grphi);
495} 494}
496 495
497static void rcu_print_task_stall_end(void) 496static void rcu_print_task_stall_end(void)
498{ 497{
499 printk(KERN_CONT "\n"); 498 pr_cont("\n");
500} 499}
501 500
502#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 501#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
@@ -526,7 +525,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
526 t = list_entry(rnp->gp_tasks, 525 t = list_entry(rnp->gp_tasks,
527 struct task_struct, rcu_node_entry); 526 struct task_struct, rcu_node_entry);
528 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { 527 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
529 printk(KERN_CONT " P%d", t->pid); 528 pr_cont(" P%d", t->pid);
530 ndetected++; 529 ndetected++;
531 } 530 }
532 rcu_print_task_stall_end(); 531 rcu_print_task_stall_end();
@@ -933,6 +932,24 @@ static void __init __rcu_init_preempt(void)
933 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); 932 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
934} 933}
935 934
935/*
936 * Check for a task exiting while in a preemptible-RCU read-side
937 * critical section, clean up if so. No need to issue warnings,
938 * as debug_check_no_locks_held() already does this if lockdep
939 * is enabled.
940 */
941void exit_rcu(void)
942{
943 struct task_struct *t = current;
944
945 if (likely(list_empty(&current->rcu_node_entry)))
946 return;
947 t->rcu_read_lock_nesting = 1;
948 barrier();
949 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
950 __rcu_read_unlock();
951}
952
936#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 953#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
937 954
938static struct rcu_state *rcu_state = &rcu_sched_state; 955static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -942,7 +959,7 @@ static struct rcu_state *rcu_state = &rcu_sched_state;
942 */ 959 */
943static void __init rcu_bootup_announce(void) 960static void __init rcu_bootup_announce(void)
944{ 961{
945 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 962 pr_info("Hierarchical RCU implementation.\n");
946 rcu_bootup_announce_oddness(); 963 rcu_bootup_announce_oddness();
947} 964}
948 965
@@ -1101,6 +1118,14 @@ static void __init __rcu_init_preempt(void)
1101{ 1118{
1102} 1119}
1103 1120
1121/*
1122 * Because preemptible RCU does not exist, tasks cannot possibly exit
1123 * while in preemptible RCU read-side critical sections.
1124 */
1125void exit_rcu(void)
1126{
1127}
1128
1104#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1129#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1105 1130
1106#ifdef CONFIG_RCU_BOOST 1131#ifdef CONFIG_RCU_BOOST
@@ -1327,7 +1352,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1327 * already exist. We only create this kthread for preemptible RCU. 1352 * already exist. We only create this kthread for preemptible RCU.
1328 * Returns zero if all is well, a negated errno otherwise. 1353 * Returns zero if all is well, a negated errno otherwise.
1329 */ 1354 */
1330static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1355static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1331 struct rcu_node *rnp) 1356 struct rcu_node *rnp)
1332{ 1357{
1333 int rnp_index = rnp - &rsp->node[0]; 1358 int rnp_index = rnp - &rsp->node[0];
@@ -1482,7 +1507,7 @@ static int __init rcu_spawn_kthreads(void)
1482} 1507}
1483early_initcall(rcu_spawn_kthreads); 1508early_initcall(rcu_spawn_kthreads);
1484 1509
1485static void __cpuinit rcu_prepare_kthreads(int cpu) 1510static void rcu_prepare_kthreads(int cpu)
1486{ 1511{
1487 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 1512 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1488 struct rcu_node *rnp = rdp->mynode; 1513 struct rcu_node *rnp = rdp->mynode;
@@ -1524,7 +1549,7 @@ static int __init rcu_scheduler_really_started(void)
1524} 1549}
1525early_initcall(rcu_scheduler_really_started); 1550early_initcall(rcu_scheduler_really_started);
1526 1551
1527static void __cpuinit rcu_prepare_kthreads(int cpu) 1552static void rcu_prepare_kthreads(int cpu)
1528{ 1553{
1529} 1554}
1530 1555
@@ -1629,7 +1654,7 @@ static bool rcu_try_advance_all_cbs(void)
1629 */ 1654 */
1630 if (rdp->completed != rnp->completed && 1655 if (rdp->completed != rnp->completed &&
1631 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) 1656 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
1632 rcu_process_gp_end(rsp, rdp); 1657 note_gp_changes(rsp, rdp);
1633 1658
1634 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1659 if (cpu_has_callbacks_ready_to_invoke(rdp))
1635 cbs_ready = true; 1660 cbs_ready = true;
@@ -1883,7 +1908,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1883/* Initiate the stall-info list. */ 1908/* Initiate the stall-info list. */
1884static void print_cpu_stall_info_begin(void) 1909static void print_cpu_stall_info_begin(void)
1885{ 1910{
1886 printk(KERN_CONT "\n"); 1911 pr_cont("\n");
1887} 1912}
1888 1913
1889/* 1914/*
@@ -1914,7 +1939,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1914 ticks_value = rsp->gpnum - rdp->gpnum; 1939 ticks_value = rsp->gpnum - rdp->gpnum;
1915 } 1940 }
1916 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1941 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1917 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", 1942 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
1918 cpu, ticks_value, ticks_title, 1943 cpu, ticks_value, ticks_title,
1919 atomic_read(&rdtp->dynticks) & 0xfff, 1944 atomic_read(&rdtp->dynticks) & 0xfff,
1920 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1945 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
@@ -1925,7 +1950,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1925/* Terminate the stall-info list. */ 1950/* Terminate the stall-info list. */
1926static void print_cpu_stall_info_end(void) 1951static void print_cpu_stall_info_end(void)
1927{ 1952{
1928 printk(KERN_ERR "\t"); 1953 pr_err("\t");
1929} 1954}
1930 1955
1931/* Zero ->ticks_this_gp for all flavors of RCU. */ 1956/* Zero ->ticks_this_gp for all flavors of RCU. */
@@ -1948,17 +1973,17 @@ static void increment_cpu_stall_ticks(void)
1948 1973
1949static void print_cpu_stall_info_begin(void) 1974static void print_cpu_stall_info_begin(void)
1950{ 1975{
1951 printk(KERN_CONT " {"); 1976 pr_cont(" {");
1952} 1977}
1953 1978
1954static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) 1979static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1955{ 1980{
1956 printk(KERN_CONT " %d", cpu); 1981 pr_cont(" %d", cpu);
1957} 1982}
1958 1983
1959static void print_cpu_stall_info_end(void) 1984static void print_cpu_stall_info_end(void)
1960{ 1985{
1961 printk(KERN_CONT "} "); 1986 pr_cont("} ");
1962} 1987}
1963 1988
1964static void zero_cpu_stall_ticks(struct rcu_data *rdp) 1989static void zero_cpu_stall_ticks(struct rcu_data *rdp)
diff --git a/kernel/reboot.c b/kernel/reboot.c
new file mode 100644
index 000000000000..269ed9384cc4
--- /dev/null
+++ b/kernel/reboot.c
@@ -0,0 +1,419 @@
1/*
2 * linux/kernel/reboot.c
3 *
4 * Copyright (C) 2013 Linus Torvalds
5 */
6
7#define pr_fmt(fmt) "reboot: " fmt
8
9#include <linux/ctype.h>
10#include <linux/export.h>
11#include <linux/kexec.h>
12#include <linux/kmod.h>
13#include <linux/kmsg_dump.h>
14#include <linux/reboot.h>
15#include <linux/suspend.h>
16#include <linux/syscalls.h>
17#include <linux/syscore_ops.h>
18#include <linux/uaccess.h>
19
20/*
21 * this indicates whether you can reboot with ctrl-alt-del: the default is yes
22 */
23
24int C_A_D = 1;
25struct pid *cad_pid;
26EXPORT_SYMBOL(cad_pid);
27
28#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32)
29#define DEFAULT_REBOOT_MODE = REBOOT_HARD
30#else
31#define DEFAULT_REBOOT_MODE
32#endif
33enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
34
35int reboot_default;
36int reboot_cpu;
37enum reboot_type reboot_type = BOOT_ACPI;
38int reboot_force;
39
40/*
41 * If set, this is used for preparing the system to power off.
42 */
43
44void (*pm_power_off_prepare)(void);
45
46/**
47 * emergency_restart - reboot the system
48 *
49 * Without shutting down any hardware or taking any locks
50 * reboot the system. This is called when we know we are in
51 * trouble so this is our best effort to reboot. This is
52 * safe to call in interrupt context.
53 */
54void emergency_restart(void)
55{
56 kmsg_dump(KMSG_DUMP_EMERG);
57 machine_emergency_restart();
58}
59EXPORT_SYMBOL_GPL(emergency_restart);
60
61void kernel_restart_prepare(char *cmd)
62{
63 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
64 system_state = SYSTEM_RESTART;
65 usermodehelper_disable();
66 device_shutdown();
67}
68
69/**
70 * register_reboot_notifier - Register function to be called at reboot time
71 * @nb: Info about notifier function to be called
72 *
73 * Registers a function with the list of functions
74 * to be called at reboot time.
75 *
76 * Currently always returns zero, as blocking_notifier_chain_register()
77 * always returns zero.
78 */
79int register_reboot_notifier(struct notifier_block *nb)
80{
81 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
82}
83EXPORT_SYMBOL(register_reboot_notifier);
84
85/**
86 * unregister_reboot_notifier - Unregister previously registered reboot notifier
87 * @nb: Hook to be unregistered
88 *
89 * Unregisters a previously registered reboot
90 * notifier function.
91 *
92 * Returns zero on success, or %-ENOENT on failure.
93 */
94int unregister_reboot_notifier(struct notifier_block *nb)
95{
96 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
97}
98EXPORT_SYMBOL(unregister_reboot_notifier);
99
100static void migrate_to_reboot_cpu(void)
101{
102 /* The boot cpu is always logical cpu 0 */
103 int cpu = reboot_cpu;
104
105 cpu_hotplug_disable();
106
107 /* Make certain the cpu I'm about to reboot on is online */
108 if (!cpu_online(cpu))
109 cpu = cpumask_first(cpu_online_mask);
110
111 /* Prevent races with other tasks migrating this task */
112 current->flags |= PF_NO_SETAFFINITY;
113
114 /* Make certain I only run on the appropriate processor */
115 set_cpus_allowed_ptr(current, cpumask_of(cpu));
116}
117
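migrate_to_reboot_cpu() above pins the rebooting task to one online CPU before the machine_*() hooks run. A rough userspace analogue of that pinning step, using sched_setaffinity() on the calling thread; CPU 0 is just an example stand-in for reboot_cpu:

        /* Userspace analogue of restricting the current task to a single CPU. */
        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>

        int main(void)
        {
                cpu_set_t set;
                int cpu = 0;                    /* "reboot_cpu" stand-in */

                CPU_ZERO(&set);
                CPU_SET(cpu, &set);
                if (sched_setaffinity(0, sizeof(set), &set))    /* 0 == this thread */
                        perror("sched_setaffinity");
                else
                        printf("now restricted to CPU %d\n", cpu);
                return 0;
        }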
118/**
119 * kernel_restart - reboot the system
120 * @cmd: pointer to buffer containing command to execute for restart
121 * or %NULL
122 *
123 * Shutdown everything and perform a clean reboot.
124 * This is not safe to call in interrupt context.
125 */
126void kernel_restart(char *cmd)
127{
128 kernel_restart_prepare(cmd);
129 migrate_to_reboot_cpu();
130 syscore_shutdown();
131 if (!cmd)
132 pr_emerg("Restarting system\n");
133 else
134 pr_emerg("Restarting system with command '%s'\n", cmd);
135 kmsg_dump(KMSG_DUMP_RESTART);
136 machine_restart(cmd);
137}
138EXPORT_SYMBOL_GPL(kernel_restart);
139
140static void kernel_shutdown_prepare(enum system_states state)
141{
142 blocking_notifier_call_chain(&reboot_notifier_list,
143 (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL);
144 system_state = state;
145 usermodehelper_disable();
146 device_shutdown();
147}
148/**
149 * kernel_halt - halt the system
150 *
151 * Shutdown everything and perform a clean system halt.
152 */
153void kernel_halt(void)
154{
155 kernel_shutdown_prepare(SYSTEM_HALT);
156 migrate_to_reboot_cpu();
157 syscore_shutdown();
158 pr_emerg("System halted\n");
159 kmsg_dump(KMSG_DUMP_HALT);
160 machine_halt();
161}
162EXPORT_SYMBOL_GPL(kernel_halt);
163
164/**
165 * kernel_power_off - power_off the system
166 *
167 * Shutdown everything and perform a clean system power_off.
168 */
169void kernel_power_off(void)
170{
171 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
172 if (pm_power_off_prepare)
173 pm_power_off_prepare();
174 migrate_to_reboot_cpu();
175 syscore_shutdown();
176 pr_emerg("Power down\n");
177 kmsg_dump(KMSG_DUMP_POWEROFF);
178 machine_power_off();
179}
180EXPORT_SYMBOL_GPL(kernel_power_off);
181
182static DEFINE_MUTEX(reboot_mutex);
183
184/*
185 * Reboot system call: for obvious reasons only root may call it,
186 * and even root needs to set up some magic numbers in the registers
187 * so that some mistake won't make this reboot the whole machine.
188 * You can also set the meaning of the ctrl-alt-del-key here.
189 *
190 * reboot doesn't sync: do that yourself before calling this.
191 */
192SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
193 void __user *, arg)
194{
195 struct pid_namespace *pid_ns = task_active_pid_ns(current);
196 char buffer[256];
197 int ret = 0;
198
199 /* We only trust the superuser with rebooting the system. */
200 if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
201 return -EPERM;
202
203 /* For safety, we require "magic" arguments. */
204 if (magic1 != LINUX_REBOOT_MAGIC1 ||
205 (magic2 != LINUX_REBOOT_MAGIC2 &&
206 magic2 != LINUX_REBOOT_MAGIC2A &&
207 magic2 != LINUX_REBOOT_MAGIC2B &&
208 magic2 != LINUX_REBOOT_MAGIC2C))
209 return -EINVAL;
210
211 /*
212 * If pid namespaces are enabled and the current task is in a child
213 * pid_namespace, the command is handled by reboot_pid_ns() which will
214 * call do_exit().
215 */
216 ret = reboot_pid_ns(pid_ns, cmd);
217 if (ret)
218 return ret;
219
220 /* Instead of trying to make the power_off code look like
221 * halt when pm_power_off is not set do it the easy way.
222 */
223 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
224 cmd = LINUX_REBOOT_CMD_HALT;
225
226 mutex_lock(&reboot_mutex);
227 switch (cmd) {
228 case LINUX_REBOOT_CMD_RESTART:
229 kernel_restart(NULL);
230 break;
231
232 case LINUX_REBOOT_CMD_CAD_ON:
233 C_A_D = 1;
234 break;
235
236 case LINUX_REBOOT_CMD_CAD_OFF:
237 C_A_D = 0;
238 break;
239
240 case LINUX_REBOOT_CMD_HALT:
241 kernel_halt();
242 do_exit(0);
243 panic("cannot halt");
244
245 case LINUX_REBOOT_CMD_POWER_OFF:
246 kernel_power_off();
247 do_exit(0);
248 break;
249
250 case LINUX_REBOOT_CMD_RESTART2:
251 ret = strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1);
252 if (ret < 0) {
253 ret = -EFAULT;
254 break;
255 }
256 buffer[sizeof(buffer) - 1] = '\0';
257
258 kernel_restart(buffer);
259 break;
260
261#ifdef CONFIG_KEXEC
262 case LINUX_REBOOT_CMD_KEXEC:
263 ret = kernel_kexec();
264 break;
265#endif
266
267#ifdef CONFIG_HIBERNATION
268 case LINUX_REBOOT_CMD_SW_SUSPEND:
269 ret = hibernate();
270 break;
271#endif
272
273 default:
274 ret = -EINVAL;
275 break;
276 }
277 mutex_unlock(&reboot_mutex);
278 return ret;
279}
280
281static void deferred_cad(struct work_struct *dummy)
282{
283 kernel_restart(NULL);
284}
285
286/*
287 * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
288 * As it's called within an interrupt, it may NOT sync: the only choice
289 * is whether to reboot at once, or just ignore the ctrl-alt-del.
290 */
291void ctrl_alt_del(void)
292{
293 static DECLARE_WORK(cad_work, deferred_cad);
294
295 if (C_A_D)
296 schedule_work(&cad_work);
297 else
298 kill_cad_pid(SIGINT, 1);
299}
300
301char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
302
303static int __orderly_poweroff(bool force)
304{
305 char **argv;
306 static char *envp[] = {
307 "HOME=/",
308 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
309 NULL
310 };
311 int ret;
312
313 argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
314 if (argv) {
315 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
316 argv_free(argv);
317 } else {
318 ret = -ENOMEM;
319 }
320
321 if (ret && force) {
322 pr_warn("Failed to start orderly shutdown: forcing the issue\n");
323 /*
324 * I guess this should try to kick off some daemon to sync and
325 * poweroff asap. Or not even bother syncing if we're doing an
326 * emergency shutdown?
327 */
328 emergency_sync();
329 kernel_power_off();
330 }
331
332 return ret;
333}
334
335static bool poweroff_force;
336
337static void poweroff_work_func(struct work_struct *work)
338{
339 __orderly_poweroff(poweroff_force);
340}
341
342static DECLARE_WORK(poweroff_work, poweroff_work_func);
343
344/**
345 * orderly_poweroff - Trigger an orderly system poweroff
346 * @force: force poweroff if command execution fails
347 *
348 * This may be called from any context to trigger a system shutdown.
349 * If the orderly shutdown fails, it will force an immediate shutdown.
350 */
351int orderly_poweroff(bool force)
352{
353 if (force) /* do not override the pending "true" */
354 poweroff_force = true;
355 schedule_work(&poweroff_work);
356 return 0;
357}
358EXPORT_SYMBOL_GPL(orderly_poweroff);
359
360static int __init reboot_setup(char *str)
361{
362 for (;;) {
363 /*
364 * Having anything passed on the command line via
365 * reboot= will cause us to disable DMI checking
366 * below.
367 */
368 reboot_default = 0;
369
370 switch (*str) {
371 case 'w':
372 reboot_mode = REBOOT_WARM;
373 break;
374
375 case 'c':
376 reboot_mode = REBOOT_COLD;
377 break;
378
379 case 'h':
380 reboot_mode = REBOOT_HARD;
381 break;
382
383 case 's':
384 if (isdigit(*(str+1)))
385 reboot_cpu = simple_strtoul(str+1, NULL, 0);
386 else if (str[1] == 'm' && str[2] == 'p' &&
387 isdigit(*(str+3)))
388 reboot_cpu = simple_strtoul(str+3, NULL, 0);
389 else
390 reboot_mode = REBOOT_SOFT;
391 break;
392
393 case 'g':
394 reboot_mode = REBOOT_GPIO;
395 break;
396
397 case 'b':
398 case 'a':
399 case 'k':
400 case 't':
401 case 'e':
402 case 'p':
403 reboot_type = *str;
404 break;
405
406 case 'f':
407 reboot_force = 1;
408 break;
409 }
410
411 str = strchr(str, ',');
412 if (str)
413 str++;
414 else
415 break;
416 }
417 return 1;
418}
419__setup("reboot=", reboot_setup);
diff --git a/kernel/relay.c b/kernel/relay.c
index b91488ba2e5a..5001c9887db1 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -516,7 +516,7 @@ static void setup_callbacks(struct rchan *chan,
516 * 516 *
517 * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) 517 * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
518 */ 518 */
519static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, 519static int relay_hotcpu_callback(struct notifier_block *nb,
520 unsigned long action, 520 unsigned long action,
521 void *hcpu) 521 void *hcpu)
522{ 522{
diff --git a/kernel/resource.c b/kernel/resource.c
index d7386986e10e..3f285dce9347 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -409,6 +409,7 @@ int __weak page_is_ram(unsigned long pfn)
409{ 409{
410 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 410 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
411} 411}
412EXPORT_SYMBOL_GPL(page_is_ram);
412 413
413void __weak arch_remove_reservations(struct resource *avail) 414void __weak arch_remove_reservations(struct resource *avail)
414{ 415{
@@ -448,7 +449,6 @@ static int __find_resource(struct resource *root, struct resource *old,
448 struct resource *this = root->child; 449 struct resource *this = root->child;
449 struct resource tmp = *new, avail, alloc; 450 struct resource tmp = *new, avail, alloc;
450 451
451 tmp.flags = new->flags;
452 tmp.start = root->start; 452 tmp.start = root->start;
453 /* 453 /*
454 * Skip past an allocated resource that starts at 0, since the assignment 454 * Skip past an allocated resource that starts at 0, since the assignment
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 1e09308bf2a1..0dd6aec1cb6a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -145,6 +145,19 @@ int max_lock_depth = 1024;
145/* 145/*
146 * Adjust the priority chain. Also used for deadlock detection. 146 * Adjust the priority chain. Also used for deadlock detection.
147 * Decreases task's usage by one - may thus free the task. 147 * Decreases task's usage by one - may thus free the task.
148 *
149 * @task: the task owning the mutex (owner) for which a chain walk is probably
150 * needed
151 * @deadlock_detect: do we have to carry out deadlock detection?
152 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
153 * things for a task that has just got its priority adjusted, and
154 * is waiting on a mutex)
155 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
156 * its priority to the mutex owner (can be NULL in the case
157 * depicted above or if the top waiter is gone away and we are
158 * actually deboosting the owner)
159 * @top_task: the current top waiter
160 *
148 * Returns 0 or -EDEADLK. 161 * Returns 0 or -EDEADLK.
149 */ 162 */
150static int rt_mutex_adjust_prio_chain(struct task_struct *task, 163static int rt_mutex_adjust_prio_chain(struct task_struct *task,
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index deaf90e4a1de..54adcf35f495 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o 15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 64de5f8b0c9e..4a073539c58e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -77,8 +77,6 @@ static inline struct autogroup *autogroup_create(void)
77 if (IS_ERR(tg)) 77 if (IS_ERR(tg))
78 goto out_free; 78 goto out_free;
79 79
80 sched_online_group(tg, &root_task_group);
81
82 kref_init(&ag->kref); 80 kref_init(&ag->kref);
83 init_rwsem(&ag->lock); 81 init_rwsem(&ag->lock);
84 ag->id = atomic_inc_return(&autogroup_seq_nr); 82 ag->id = atomic_inc_return(&autogroup_seq_nr);
@@ -98,6 +96,7 @@ static inline struct autogroup *autogroup_create(void)
98#endif 96#endif
99 tg->autogroup = ag; 97 tg->autogroup = ag;
100 98
99 sched_online_group(tg, &root_task_group);
101 return ag; 100 return ag;
102 101
103out_free: 102out_free:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8b335016c52..05c39f030314 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -370,13 +370,6 @@ static struct rq *this_rq_lock(void)
370#ifdef CONFIG_SCHED_HRTICK 370#ifdef CONFIG_SCHED_HRTICK
371/* 371/*
372 * Use HR-timers to deliver accurate preemption points. 372 * Use HR-timers to deliver accurate preemption points.
373 *
374 * Its all a bit involved since we cannot program an hrt while holding the
375 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
376 * reschedule event.
377 *
378 * When we get rescheduled we reprogram the hrtick_timer outside of the
379 * rq->lock.
380 */ 373 */
381 374
382static void hrtick_clear(struct rq *rq) 375static void hrtick_clear(struct rq *rq)
@@ -404,6 +397,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
404} 397}
405 398
406#ifdef CONFIG_SMP 399#ifdef CONFIG_SMP
400
401static int __hrtick_restart(struct rq *rq)
402{
403 struct hrtimer *timer = &rq->hrtick_timer;
404 ktime_t time = hrtimer_get_softexpires(timer);
405
406 return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
407}
408
407/* 409/*
408 * called from hardirq (IPI) context 410 * called from hardirq (IPI) context
409 */ 411 */
@@ -412,7 +414,7 @@ static void __hrtick_start(void *arg)
412 struct rq *rq = arg; 414 struct rq *rq = arg;
413 415
414 raw_spin_lock(&rq->lock); 416 raw_spin_lock(&rq->lock);
415 hrtimer_restart(&rq->hrtick_timer); 417 __hrtick_restart(rq);
416 rq->hrtick_csd_pending = 0; 418 rq->hrtick_csd_pending = 0;
417 raw_spin_unlock(&rq->lock); 419 raw_spin_unlock(&rq->lock);
418} 420}
@@ -430,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay)
430 hrtimer_set_expires(timer, time); 432 hrtimer_set_expires(timer, time);
431 433
432 if (rq == this_rq()) { 434 if (rq == this_rq()) {
433 hrtimer_restart(timer); 435 __hrtick_restart(rq);
434 } else if (!rq->hrtick_csd_pending) { 436 } else if (!rq->hrtick_csd_pending) {
435 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); 437 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
436 rq->hrtick_csd_pending = 1; 438 rq->hrtick_csd_pending = 1;
@@ -679,7 +681,7 @@ void sched_avg_update(struct rq *rq)
679{ 681{
680 s64 period = sched_avg_period(); 682 s64 period = sched_avg_period();
681 683
682 while ((s64)(rq->clock - rq->age_stamp) > period) { 684 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
683 /* 685 /*
684 * Inline assembly required to prevent the compiler 686 * Inline assembly required to prevent the compiler
685 * optimising this loop into a divmod call. 687 * optimising this loop into a divmod call.
@@ -931,6 +933,8 @@ static int effective_prio(struct task_struct *p)
931/** 933/**
932 * task_curr - is this task currently executing on a CPU? 934 * task_curr - is this task currently executing on a CPU?
933 * @p: the task in question. 935 * @p: the task in question.
936 *
937 * Return: 1 if the task is currently executing. 0 otherwise.
934 */ 938 */
935inline int task_curr(const struct task_struct *p) 939inline int task_curr(const struct task_struct *p)
936{ 940{
@@ -1340,7 +1344,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1340 p->sched_class->task_woken(rq, p); 1344 p->sched_class->task_woken(rq, p);
1341 1345
1342 if (rq->idle_stamp) { 1346 if (rq->idle_stamp) {
1343 u64 delta = rq->clock - rq->idle_stamp; 1347 u64 delta = rq_clock(rq) - rq->idle_stamp;
1344 u64 max = 2*sysctl_sched_migration_cost; 1348 u64 max = 2*sysctl_sched_migration_cost;
1345 1349
1346 if (delta > max) 1350 if (delta > max)
@@ -1377,6 +1381,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
1377 1381
1378 rq = __task_rq_lock(p); 1382 rq = __task_rq_lock(p);
1379 if (p->on_rq) { 1383 if (p->on_rq) {
1384 /* check_preempt_curr() may use rq clock */
1385 update_rq_clock(rq);
1380 ttwu_do_wakeup(rq, p, wake_flags); 1386 ttwu_do_wakeup(rq, p, wake_flags);
1381 ret = 1; 1387 ret = 1;
1382 } 1388 }
@@ -1478,7 +1484,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
1478 * the simpler "current->state = TASK_RUNNING" to mark yourself 1484 * the simpler "current->state = TASK_RUNNING" to mark yourself
1479 * runnable without the overhead of this. 1485 * runnable without the overhead of this.
1480 * 1486 *
1481 * Returns %true if @p was woken up, %false if it was already running 1487 * Return: %true if @p was woken up, %false if it was already running.
1482 * or @state didn't match @p's state. 1488 * or @state didn't match @p's state.
1483 */ 1489 */
1484static int 1490static int
@@ -1487,7 +1493,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1487 unsigned long flags; 1493 unsigned long flags;
1488 int cpu, success = 0; 1494 int cpu, success = 0;
1489 1495
1490 smp_wmb(); 1496 /*
1497 * If we are going to wake up a thread waiting for CONDITION we
1498 * need to ensure that CONDITION=1 done by the caller can not be
1499 * reordered with p->state check below. This pairs with mb() in
1500 * set_current_state() the waiting thread does.
1501 */
1502 smp_mb__before_spinlock();
1491 raw_spin_lock_irqsave(&p->pi_lock, flags); 1503 raw_spin_lock_irqsave(&p->pi_lock, flags);
1492 if (!(p->state & state)) 1504 if (!(p->state & state))
1493 goto out; 1505 goto out;
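The comment replacing the bare smp_wmb() describes the classic store-buffering pairing between waker and sleeper: each side stores, then issues a full barrier, then loads what the other side stored, so at least one of them sees the other's update and the wakeup cannot be lost. A userspace C11 sketch of that pairing; the names are invented, and seq_cst fences stand in for smp_mb__before_spinlock() and the mb() in set_current_state():

        /* C11 model of the waker/sleeper barrier pairing described above. */
        #include <pthread.h>
        #include <stdatomic.h>
        #include <stdio.h>

        static atomic_int condition;    /* CONDITION the sleeper waits for */
        static atomic_int task_state;   /* 1 == sleeping, 0 == running     */

        static void *waker(void *arg)
        {
                (void)arg;
                atomic_store_explicit(&condition, 1, memory_order_relaxed);
                atomic_thread_fence(memory_order_seq_cst);  /* smp_mb__before_spinlock() */
                if (atomic_load_explicit(&task_state, memory_order_relaxed))
                        puts("waker: sleeper still blocked, would wake it");
                return NULL;
        }

        static void *sleeper(void *arg)
        {
                (void)arg;
                atomic_store_explicit(&task_state, 1, memory_order_relaxed);
                atomic_thread_fence(memory_order_seq_cst);  /* set_current_state() mb() */
                if (!atomic_load_explicit(&condition, memory_order_relaxed))
                        puts("sleeper: condition not yet set, would block");
                return NULL;
        }

        int main(void)
        {
                pthread_t a, b;

                /* With both fences in place, at least one thread observes the
                 * other's store, so the wakeup is never silently dropped. */
                pthread_create(&a, NULL, waker, NULL);
                pthread_create(&b, NULL, sleeper, NULL);
                pthread_join(a, NULL);
                pthread_join(b, NULL);
                return 0;
        }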
@@ -1573,8 +1585,9 @@ out:
1573 * @p: The process to be woken up. 1585 * @p: The process to be woken up.
1574 * 1586 *
1575 * Attempt to wake up the nominated process and move it to the set of runnable 1587 * Attempt to wake up the nominated process and move it to the set of runnable
1576 * processes. Returns 1 if the process was woken up, 0 if it was already 1588 * processes.
1577 * running. 1589 *
1590 * Return: 1 if the process was woken up, 0 if it was already running.
1578 * 1591 *
1579 * It may be assumed that this function implies a write memory barrier before 1592 * It may be assumed that this function implies a write memory barrier before
1580 * changing the task state if and only if any tasks are woken up. 1593 * changing the task state if and only if any tasks are woken up.
@@ -1609,15 +1622,6 @@ static void __sched_fork(struct task_struct *p)
1609 p->se.vruntime = 0; 1622 p->se.vruntime = 0;
1610 INIT_LIST_HEAD(&p->se.group_node); 1623 INIT_LIST_HEAD(&p->se.group_node);
1611 1624
1612/*
1613 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
1614 * removed when useful for applications beyond shares distribution (e.g.
1615 * load-balance).
1616 */
1617#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1618 p->se.avg.runnable_avg_period = 0;
1619 p->se.avg.runnable_avg_sum = 0;
1620#endif
1621#ifdef CONFIG_SCHEDSTATS 1625#ifdef CONFIG_SCHEDSTATS
1622 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1626 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1623#endif 1627#endif
@@ -1761,6 +1765,8 @@ void wake_up_new_task(struct task_struct *p)
1761 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1765 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1762#endif 1766#endif
1763 1767
1768 /* Initialize new task's runnable average */
1769 init_task_runnable_average(p);
1764 rq = __task_rq_lock(p); 1770 rq = __task_rq_lock(p);
1765 activate_task(rq, p, 0); 1771 activate_task(rq, p, 0);
1766 p->on_rq = 1; 1772 p->on_rq = 1;
@@ -2069,575 +2075,6 @@ unsigned long nr_iowait_cpu(int cpu)
2069 return atomic_read(&this->nr_iowait); 2075 return atomic_read(&this->nr_iowait);
2070} 2076}
2071 2077
2072unsigned long this_cpu_load(void)
2073{
2074 struct rq *this = this_rq();
2075 return this->cpu_load[0];
2076}
2077
2078
2079/*
2080 * Global load-average calculations
2081 *
2082 * We take a distributed and async approach to calculating the global load-avg
2083 * in order to minimize overhead.
2084 *
2085 * The global load average is an exponentially decaying average of nr_running +
2086 * nr_uninterruptible.
2087 *
2088 * Once every LOAD_FREQ:
2089 *
2090 * nr_active = 0;
2091 * for_each_possible_cpu(cpu)
2092 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2093 *
2094 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2095 *
2096 * Due to a number of reasons the above turns in the mess below:
2097 *
2098 * - for_each_possible_cpu() is prohibitively expensive on machines with
2099 * serious number of cpus, therefore we need to take a distributed approach
2100 * to calculating nr_active.
2101 *
2102 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2103 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2104 *
2105 * So assuming nr_active := 0 when we start out -- true per definition, we
2106 * can simply take per-cpu deltas and fold those into a global accumulate
2107 * to obtain the same result. See calc_load_fold_active().
2108 *
2109 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2110 * across the machine, we assume 10 ticks is sufficient time for every
2111 * cpu to have completed this task.
2112 *
2113 * This places an upper-bound on the IRQ-off latency of the machine. Then
 2114 * again, being late doesn't lose the delta, just wrecks the sample.
2115 *
2116 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2117 * this would add another cross-cpu cacheline miss and atomic operation
2118 * to the wakeup path. Instead we increment on whatever cpu the task ran
2119 * when it went into uninterruptible state and decrement on whatever cpu
2120 * did the wakeup. This means that only the sum of nr_uninterruptible over
2121 * all cpus yields the correct result.
2122 *
2123 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
2124 */
2125
2126/* Variables and functions for calc_load */
2127static atomic_long_t calc_load_tasks;
2128static unsigned long calc_load_update;
2129unsigned long avenrun[3];
2130EXPORT_SYMBOL(avenrun); /* should be removed */
2131
2132/**
2133 * get_avenrun - get the load average array
2134 * @loads: pointer to dest load array
2135 * @offset: offset to add
2136 * @shift: shift count to shift the result left
2137 *
2138 * These values are estimates at best, so no need for locking.
2139 */
2140void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2141{
2142 loads[0] = (avenrun[0] + offset) << shift;
2143 loads[1] = (avenrun[1] + offset) << shift;
2144 loads[2] = (avenrun[2] + offset) << shift;
2145}
2146
2147static long calc_load_fold_active(struct rq *this_rq)
2148{
2149 long nr_active, delta = 0;
2150
2151 nr_active = this_rq->nr_running;
2152 nr_active += (long) this_rq->nr_uninterruptible;
2153
2154 if (nr_active != this_rq->calc_load_active) {
2155 delta = nr_active - this_rq->calc_load_active;
2156 this_rq->calc_load_active = nr_active;
2157 }
2158
2159 return delta;
2160}
2161
2162/*
2163 * a1 = a0 * e + a * (1 - e)
2164 */
2165static unsigned long
2166calc_load(unsigned long load, unsigned long exp, unsigned long active)
2167{
2168 load *= exp;
2169 load += active * (FIXED_1 - exp);
2170 load += 1UL << (FSHIFT - 1);
2171 return load >> FSHIFT;
2172}
2173
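calc_load() above is plain fixed-point arithmetic for the documented recurrence a1 = a0*e + a*(1-e), rounded at FSHIFT fractional bits. A self-contained sketch that reuses the formula with the usual one-minute constants; FSHIFT, FIXED_1, and EXP_1 follow the common definitions, and the nr_active samples are made up:

        /* Fixed-point one-minute load average per a1 = a0*e + a*(1-e). */
        #include <stdio.h>

        #define FSHIFT  11
        #define FIXED_1 (1UL << FSHIFT)
        #define EXP_1   1884UL          /* ~ FIXED_1 * exp(-5s/60s) */

        static unsigned long calc_load(unsigned long load, unsigned long exp,
                                       unsigned long active)
        {
                load *= exp;
                load += active * (FIXED_1 - exp);
                load += 1UL << (FSHIFT - 1);    /* round to nearest */
                return load >> FSHIFT;
        }

        int main(void)
        {
                unsigned long avenrun = 0;
                int i;

                /* Feed ten 5-second samples of 4 runnable tasks (made-up input). */
                for (i = 0; i < 10; i++)
                        avenrun = calc_load(avenrun, EXP_1, 4 * FIXED_1);
                printf("loadavg ~ %lu.%02lu\n", avenrun >> FSHIFT,
                       ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
                return 0;
        }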
2174#ifdef CONFIG_NO_HZ_COMMON
2175/*
2176 * Handle NO_HZ for the global load-average.
2177 *
2178 * Since the above described distributed algorithm to compute the global
2179 * load-average relies on per-cpu sampling from the tick, it is affected by
2180 * NO_HZ.
2181 *
2182 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2183 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2184 * when we read the global state.
2185 *
2186 * Obviously reality has to ruin such a delightfully simple scheme:
2187 *
2188 * - When we go NO_HZ idle during the window, we can negate our sample
2189 * contribution, causing under-accounting.
2190 *
2191 * We avoid this by keeping two idle-delta counters and flipping them
2192 * when the window starts, thus separating old and new NO_HZ load.
2193 *
2194 * The only trick is the slight shift in index flip for read vs write.
2195 *
2196 * 0s 5s 10s 15s
2197 * +10 +10 +10 +10
2198 * |-|-----------|-|-----------|-|-----------|-|
2199 * r:0 0 1 1 0 0 1 1 0
2200 * w:0 1 1 0 0 1 1 0 0
2201 *
2202 * This ensures we'll fold the old idle contribution in this window while
 2203 * accumulating the new one.
2204 *
2205 * - When we wake up from NO_HZ idle during the window, we push up our
2206 * contribution, since we effectively move our sample point to a known
2207 * busy state.
2208 *
2209 * This is solved by pushing the window forward, and thus skipping the
2210 * sample, for this cpu (effectively using the idle-delta for this cpu which
2211 * was in effect at the time the window opened). This also solves the issue
2212 * of having to deal with a cpu having been in NOHZ idle for multiple
2213 * LOAD_FREQ intervals.
2214 *
2215 * When making the ILB scale, we should try to pull this in as well.
2216 */
2217static atomic_long_t calc_load_idle[2];
2218static int calc_load_idx;
2219
2220static inline int calc_load_write_idx(void)
2221{
2222 int idx = calc_load_idx;
2223
2224 /*
2225 * See calc_global_nohz(), if we observe the new index, we also
2226 * need to observe the new update time.
2227 */
2228 smp_rmb();
2229
2230 /*
2231 * If the folding window started, make sure we start writing in the
2232 * next idle-delta.
2233 */
2234 if (!time_before(jiffies, calc_load_update))
2235 idx++;
2236
2237 return idx & 1;
2238}
2239
2240static inline int calc_load_read_idx(void)
2241{
2242 return calc_load_idx & 1;
2243}
2244
2245void calc_load_enter_idle(void)
2246{
2247 struct rq *this_rq = this_rq();
2248 long delta;
2249
2250 /*
2251 * We're going into NOHZ mode, if there's any pending delta, fold it
2252 * into the pending idle delta.
2253 */
2254 delta = calc_load_fold_active(this_rq);
2255 if (delta) {
2256 int idx = calc_load_write_idx();
2257 atomic_long_add(delta, &calc_load_idle[idx]);
2258 }
2259}
2260
2261void calc_load_exit_idle(void)
2262{
2263 struct rq *this_rq = this_rq();
2264
2265 /*
2266 * If we're still before the sample window, we're done.
2267 */
2268 if (time_before(jiffies, this_rq->calc_load_update))
2269 return;
2270
2271 /*
2272 * We woke inside or after the sample window, this means we're already
2273 * accounted through the nohz accounting, so skip the entire deal and
2274 * sync up for the next window.
2275 */
2276 this_rq->calc_load_update = calc_load_update;
2277 if (time_before(jiffies, this_rq->calc_load_update + 10))
2278 this_rq->calc_load_update += LOAD_FREQ;
2279}
2280
2281static long calc_load_fold_idle(void)
2282{
2283 int idx = calc_load_read_idx();
2284 long delta = 0;
2285
2286 if (atomic_long_read(&calc_load_idle[idx]))
2287 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2288
2289 return delta;
2290}
2291
2292/**
2293 * fixed_power_int - compute: x^n, in O(log n) time
2294 *
2295 * @x: base of the power
2296 * @frac_bits: fractional bits of @x
2297 * @n: power to raise @x to.
2298 *
2299 * By exploiting the relation between the definition of the natural power
2300 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
2301 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
2302 * (where: n_i \elem {0, 1}, the binary vector representing n),
2303 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
2304 * of course trivially computable in O(log_2 n), the length of our binary
2305 * vector.
2306 */
2307static unsigned long
2308fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
2309{
2310 unsigned long result = 1UL << frac_bits;
2311
2312 if (n) for (;;) {
2313 if (n & 1) {
2314 result *= x;
2315 result += 1UL << (frac_bits - 1);
2316 result >>= frac_bits;
2317 }
2318 n >>= 1;
2319 if (!n)
2320 break;
2321 x *= x;
2322 x += 1UL << (frac_bits - 1);
2323 x >>= frac_bits;
2324 }
2325
2326 return result;
2327}
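
fixed_power_int() is ordinary binary exponentiation with a round-to-nearest shift after every multiply. A stand-alone sketch of the same idea, where FRAC_BITS and ONE are stand-ins for FSHIFT and FIXED_1, and the sample value 1884 is only assumed to match the kernel's EXP_1 constant:

#include <stdio.h>

#define FRAC_BITS 11
#define ONE       (1UL << FRAC_BITS)

static unsigned long fp_pow(unsigned long x, unsigned int n)
{
        unsigned long result = ONE;             /* 1.0 in fixed point */

        while (n) {
                if (n & 1)                      /* fold in x^(2^i) when bit i of n is set */
                        result = (result * x + (ONE >> 1)) >> FRAC_BITS;
                x = (x * x + (ONE >> 1)) >> FRAC_BITS;  /* square for the next bit */
                n >>= 1;
        }
        return result;
}

int main(void)
{
        /* EXP_1-like decay factor e ~= 1884/2048, raised to the 5th power */
        unsigned long e = 1884;

        printf("e^5 = %lu/%lu (~%.4f)\n", fp_pow(e, 5), ONE,
               (double)fp_pow(e, 5) / ONE);
        return 0;
}
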
2328
2329/*
2330 * a1 = a0 * e + a * (1 - e)
2331 *
2332 * a2 = a1 * e + a * (1 - e)
2333 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
2334 * = a0 * e^2 + a * (1 - e) * (1 + e)
2335 *
2336 * a3 = a2 * e + a * (1 - e)
2337 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
2338 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
2339 *
2340 * ...
2341 *
2342 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
2343 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
2344 * = a0 * e^n + a * (1 - e^n)
2345 *
2346 * [1] application of the geometric series:
2347 *
2348 * n 1 - x^(n+1)
2349 * S_n := \Sum x^i = -------------
2350 * i=0 1 - x
2351 */
2352static unsigned long
2353calc_load_n(unsigned long load, unsigned long exp,
2354 unsigned long active, unsigned int n)
2355{
2356
2357 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2358}
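
calc_load_n() relies on the closed form derived above: applying the one-step recurrence n times is the same as one step with e replaced by e^n. A user-space sketch that checks this, where calc_load_step() is a simplified stand-in for the kernel's calc_load() (same recurrence, plain round-to-nearest) and the constants are assumed to match the kernel's FSHIFT/FIXED_1/EXP_1:

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884            /* ~1/exp(5sec/1min) in fixed point */

static unsigned long calc_load_step(unsigned long load, unsigned long exp,
                                    unsigned long active)
{
        unsigned long newload = load * exp + active * (FIXED_1 - exp);

        return (newload + (FIXED_1 >> 1)) >> FSHIFT;
}

static unsigned long fp_pow(unsigned long x, unsigned int n)
{
        unsigned long r = FIXED_1;

        for (; n; n >>= 1, x = (x * x + (FIXED_1 >> 1)) >> FSHIFT)
                if (n & 1)
                        r = (r * x + (FIXED_1 >> 1)) >> FSHIFT;
        return r;
}

int main(void)
{
        unsigned long a0 = 3 * FIXED_1, active = 1 * FIXED_1;
        unsigned int n = 7;
        unsigned long iter = a0, en = fp_pow(EXP_1, n);

        for (unsigned int i = 0; i < n; i++)
                iter = calc_load_step(iter, EXP_1, active);

        /* a_n = a0 * e^n + active * (1 - e^n), evaluated in a single step */
        unsigned long closed = calc_load_step(a0, en, active);

        printf("iterated=%lu closed-form=%lu\n", iter, closed);
        return 0;
}

The two printed values agree up to small fixed-point rounding differences, which is what lets calc_global_nohz() catch up over many missed LOAD_FREQ intervals in O(log n) multiplies per average instead of n.
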
2359
2360/*
2361 * NO_HZ can leave us missing all per-cpu ticks calling
2362 * calc_load_account_active(), but since an idle CPU folds its delta into
2363 * calc_load_idle[] via calc_load_enter_idle(), all we need to do is fold
2364 * in the pending idle delta if our idle period crossed a load cycle boundary.
2365 *
2366 * Once we've updated the global active value, we need to apply the exponential
2367 * weights adjusted to the number of cycles missed.
2368 */
2369static void calc_global_nohz(void)
2370{
2371 long delta, active, n;
2372
2373 if (!time_before(jiffies, calc_load_update + 10)) {
2374 /*
2375 * Catch up, folding in however many intervals we are still behind
2376 */
2377 delta = jiffies - calc_load_update - 10;
2378 n = 1 + (delta / LOAD_FREQ);
2379
2380 active = atomic_long_read(&calc_load_tasks);
2381 active = active > 0 ? active * FIXED_1 : 0;
2382
2383 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2384 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2385 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2386
2387 calc_load_update += n * LOAD_FREQ;
2388 }
2389
2390 /*
2391 * Flip the idle index...
2392 *
2393 * Make sure we first write the new time then flip the index, so that
2394 * calc_load_write_idx() will see the new time when it reads the new
2395 * index, this avoids a double flip messing things up.
2396 */
2397 smp_wmb();
2398 calc_load_idx++;
2399}
2400#else /* !CONFIG_NO_HZ_COMMON */
2401
2402static inline long calc_load_fold_idle(void) { return 0; }
2403static inline void calc_global_nohz(void) { }
2404
2405#endif /* CONFIG_NO_HZ_COMMON */
2406
2407/*
2408 * calc_global_load - update the avenrun load estimates 10 ticks after the
2409 * CPUs have updated calc_load_tasks.
2410 */
2411void calc_global_load(unsigned long ticks)
2412{
2413 long active, delta;
2414
2415 if (time_before(jiffies, calc_load_update + 10))
2416 return;
2417
2418 /*
2419 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2420 */
2421 delta = calc_load_fold_idle();
2422 if (delta)
2423 atomic_long_add(delta, &calc_load_tasks);
2424
2425 active = atomic_long_read(&calc_load_tasks);
2426 active = active > 0 ? active * FIXED_1 : 0;
2427
2428 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2429 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2430 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2431
2432 calc_load_update += LOAD_FREQ;
2433
2434 /*
2435 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2436 */
2437 calc_global_nohz();
2438}
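
For reference, the avenrun[] values updated here are FIXED_1-scaled fixed-point numbers; user space turns them into the familiar decimals. A small sketch of that conversion, using a LOAD_INT/LOAD_FRAC split in the style of the /proc/loadavg formatting (treat the macros and the sample value as illustrative only):

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)

#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
        unsigned long avenrun0 = 1259;  /* example fixed-point 1-minute average */

        printf("loadavg: %lu.%02lu\n", LOAD_INT(avenrun0), LOAD_FRAC(avenrun0));
        return 0;                       /* prints "loadavg: 0.61" */
}
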
2439
2440/*
2441 * Called from update_cpu_load() to periodically update this CPU's
2442 * active count.
2443 */
2444static void calc_load_account_active(struct rq *this_rq)
2445{
2446 long delta;
2447
2448 if (time_before(jiffies, this_rq->calc_load_update))
2449 return;
2450
2451 delta = calc_load_fold_active(this_rq);
2452 if (delta)
2453 atomic_long_add(delta, &calc_load_tasks);
2454
2455 this_rq->calc_load_update += LOAD_FREQ;
2456}
2457
2458/*
2459 * End of global load-average stuff
2460 */
2461
2462/*
2463 * The exact cpuload at various idx values, calculated at every tick would be
2464 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2465 *
2466 * If a cpu misses updates for n-1 ticks (as it was idle) and the update gets
2467 * called on the nth tick, when the cpu may be busy, then we have:
2468 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2469 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
2470 *
2471 * decay_load_missed() below does efficient calculation of
2472 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2473 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
2474 *
2475 * The calculation is approximated on a 128 point scale.
2476 * degrade_zero_ticks is the number of ticks after which load at any
2477 * particular idx is approximated to be zero.
2478 * degrade_factor is a precomputed table, a row for each load idx.
2479 * Each column corresponds to degradation factor for a power of two ticks,
2480 * based on 128 point scale.
2481 * Example:
2482 * row 2, col 3 (=12) says that the degradation at load idx 2 after
2483 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
2484 *
2485 * With this power of 2 load factors, we can degrade the load n times
2486 * by looking at 1 bits in n and doing as many mult/shift instead of
2487 * n mult/shifts needed by the exact degradation.
2488 */
2489#define DEGRADE_SHIFT 7
2490static const unsigned char
2491 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
2492static const unsigned char
2493 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
2494 {0, 0, 0, 0, 0, 0, 0, 0},
2495 {64, 32, 8, 0, 0, 0, 0, 0},
2496 {96, 72, 40, 12, 1, 0, 0},
2497 {112, 98, 75, 43, 15, 1, 0},
2498 {120, 112, 98, 76, 45, 16, 2} };
2499
2500/*
2501 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
2502 * only builds up while the CPU is idle, so we just decay the old load without
2503 * adding any new load.
2504 */
2505static unsigned long
2506decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2507{
2508 int j = 0;
2509
2510 if (!missed_updates)
2511 return load;
2512
2513 if (missed_updates >= degrade_zero_ticks[idx])
2514 return 0;
2515
2516 if (idx == 1)
2517 return load >> missed_updates;
2518
2519 while (missed_updates) {
2520 if (missed_updates % 2)
2521 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2522
2523 missed_updates >>= 1;
2524 j++;
2525 }
2526 return load;
2527}
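
The loop above is the "one multiply per set bit" trick described in the comment block. A user-space check of it, with the degrade_factor rows copied from the table above (zero-padded to full width) and a comparison against the exact (3/4)^missed factor included only as a rough illustration of the 128-point approximation (build with -lm for pow()):

#include <stdio.h>
#include <math.h>

#define DEGRADE_SHIFT 7

static const unsigned char degrade_factor[5][DEGRADE_SHIFT + 1] = {
        {0, 0, 0, 0, 0, 0, 0, 0},
        {64, 32, 8, 0, 0, 0, 0, 0},
        {96, 72, 40, 12, 1, 0, 0, 0},
        {112, 98, 75, 43, 15, 1, 0, 0},
        {120, 112, 98, 76, 45, 16, 2, 0},
};

static unsigned long decay(unsigned long load, unsigned long missed, int idx)
{
        /* column j holds the factor for 2^j missed ticks */
        for (int j = 0; missed; missed >>= 1, j++)
                if (missed & 1)
                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
        return load;
}

int main(void)
{
        unsigned long load = 1000, missed = 11; /* binary 1011: 1 + 2 + 8 ticks */
        int idx = 2;                            /* per-tick factor 3/4 */

        double exact = load * pow(3.0 / 4.0, (double)missed);

        printf("table: %lu  exact: %.1f\n", decay(load, missed, idx), exact);
        return 0;
}
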
2528
2529/*
2530 * Update rq->cpu_load[] statistics. This function is usually called every
2531 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2532 * every tick. We fix it up based on jiffies.
2533 */
2534static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2535 unsigned long pending_updates)
2536{
2537 int i, scale;
2538
2539 this_rq->nr_load_updates++;
2540
2541 /* Update our load: */
2542 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2543 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2544 unsigned long old_load, new_load;
2545
2546 /* scale is effectively 1 << i now, and >> i divides by scale */
2547
2548 old_load = this_rq->cpu_load[i];
2549 old_load = decay_load_missed(old_load, pending_updates - 1, i);
2550 new_load = this_load;
2551 /*
2552 * Round up the averaging division if load is increasing. This
2553 * prevents us from getting stuck on 9 if the load is 10, for
2554 * example.
2555 */
2556 if (new_load > old_load)
2557 new_load += scale - 1;
2558
2559 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
2560 }
2561
2562 sched_avg_update(this_rq);
2563}
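
A tiny illustration of the round-up noted in the comment: averaging 9 and 10 with a plain shift truncates back to 9 on every tick, while adding scale - 1 first lets the average reach 10. The numbers are made up for the example:

#include <stdio.h>

int main(void)
{
        unsigned long old_load = 9, new_load = 10;
        int i = 1, scale = 1 << i;

        unsigned long trunc   = (old_load * (scale - 1) + new_load) >> i;
        unsigned long rounded = (old_load * (scale - 1) + new_load + scale - 1) >> i;

        /* (9 + 10) / 2 truncates to 9; rounding up reaches 10 */
        printf("truncated: %lu  rounded up: %lu\n", trunc, rounded);
        return 0;
}
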
2564
2565#ifdef CONFIG_NO_HZ_COMMON
2566/*
2567 * There is no sane way to deal with nohz on smp when using jiffies because the
2568 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2569 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2570 *
2571 * Therefore we cannot use the delta approach from the regular tick since that
2572 * would seriously skew the load calculation. However we'll make do for those
2573 * updates happening while idle (nohz_idle_balance) or coming out of idle
2574 * (tick_nohz_idle_exit).
2575 *
2576 * This means we might still be one tick off for nohz periods.
2577 */
2578
2579/*
2580 * Called from nohz_idle_balance() to update the load ratings before doing the
2581 * idle balance.
2582 */
2583void update_idle_cpu_load(struct rq *this_rq)
2584{
2585 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2586 unsigned long load = this_rq->load.weight;
2587 unsigned long pending_updates;
2588
2589 /*
2590 * bail if there's load or we're actually up-to-date.
2591 */
2592 if (load || curr_jiffies == this_rq->last_load_update_tick)
2593 return;
2594
2595 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2596 this_rq->last_load_update_tick = curr_jiffies;
2597
2598 __update_cpu_load(this_rq, load, pending_updates);
2599}
2600
2601/*
2602 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2603 */
2604void update_cpu_load_nohz(void)
2605{
2606 struct rq *this_rq = this_rq();
2607 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2608 unsigned long pending_updates;
2609
2610 if (curr_jiffies == this_rq->last_load_update_tick)
2611 return;
2612
2613 raw_spin_lock(&this_rq->lock);
2614 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2615 if (pending_updates) {
2616 this_rq->last_load_update_tick = curr_jiffies;
2617 /*
2618 * We were idle, this means load 0, the current load might be
2619 * !0 due to remote wakeups and the sort.
2620 */
2621 __update_cpu_load(this_rq, 0, pending_updates);
2622 }
2623 raw_spin_unlock(&this_rq->lock);
2624}
2625#endif /* CONFIG_NO_HZ_COMMON */
2626
2627/*
2628 * Called from scheduler_tick()
2629 */
2630static void update_cpu_load_active(struct rq *this_rq)
2631{
2632 /*
2633 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2634 */
2635 this_rq->last_load_update_tick = jiffies;
2636 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2637
2638 calc_load_account_active(this_rq);
2639}
2640
2641#ifdef CONFIG_SMP 2078#ifdef CONFIG_SMP
2642 2079
2643/* 2080/*
@@ -2686,7 +2123,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2686 2123
2687 if (task_current(rq, p)) { 2124 if (task_current(rq, p)) {
2688 update_rq_clock(rq); 2125 update_rq_clock(rq);
2689 ns = rq->clock_task - p->se.exec_start; 2126 ns = rq_clock_task(rq) - p->se.exec_start;
2690 if ((s64)ns < 0) 2127 if ((s64)ns < 0)
2691 ns = 0; 2128 ns = 0;
2692 } 2129 }
@@ -2739,8 +2176,8 @@ void scheduler_tick(void)
2739 2176
2740 raw_spin_lock(&rq->lock); 2177 raw_spin_lock(&rq->lock);
2741 update_rq_clock(rq); 2178 update_rq_clock(rq);
2742 update_cpu_load_active(rq);
2743 curr->sched_class->task_tick(rq, curr, 0); 2179 curr->sched_class->task_tick(rq, curr, 0);
2180 update_cpu_load_active(rq);
2744 raw_spin_unlock(&rq->lock); 2181 raw_spin_unlock(&rq->lock);
2745 2182
2746 perf_event_task_tick(); 2183 perf_event_task_tick();
@@ -2763,6 +2200,8 @@ void scheduler_tick(void)
2763 * This makes sure that uptime, CFS vruntime, load 2200 * This makes sure that uptime, CFS vruntime, load
2764 * balancing, etc... continue to move forward, even 2201 * balancing, etc... continue to move forward, even
2765 * with a very low granularity. 2202 * with a very low granularity.
2203 *
2204 * Return: Maximum deferment in nanoseconds.
2766 */ 2205 */
2767u64 scheduler_tick_max_deferment(void) 2206u64 scheduler_tick_max_deferment(void)
2768{ 2207{
@@ -2966,6 +2405,12 @@ need_resched:
2966 if (sched_feat(HRTICK)) 2405 if (sched_feat(HRTICK))
2967 hrtick_clear(rq); 2406 hrtick_clear(rq);
2968 2407
2408 /*
2409 * Make sure that signal_pending_state()->signal_pending() below
2410 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
2411 * done by the caller to avoid the race with signal_wake_up().
2412 */
2413 smp_mb__before_spinlock();
2969 raw_spin_lock_irq(&rq->lock); 2414 raw_spin_lock_irq(&rq->lock);
2970 2415
2971 switch_count = &prev->nivcsw; 2416 switch_count = &prev->nivcsw;
@@ -3368,8 +2813,8 @@ EXPORT_SYMBOL(wait_for_completion);
3368 * specified timeout to expire. The timeout is in jiffies. It is not 2813 * specified timeout to expire. The timeout is in jiffies. It is not
3369 * interruptible. 2814 * interruptible.
3370 * 2815 *
3371 * The return value is 0 if timed out, and positive (at least 1, or number of 2816 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
3372 * jiffies left till timeout) if completed. 2817 * till timeout) if completed.
3373 */ 2818 */
3374unsigned long __sched 2819unsigned long __sched
3375wait_for_completion_timeout(struct completion *x, unsigned long timeout) 2820wait_for_completion_timeout(struct completion *x, unsigned long timeout)
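
The reworded kernel-doc above is the whole contract of wait_for_completion_timeout(): 0 means the timeout expired, a positive value is the number of jiffies that were left. A minimal kernel-style sketch of how callers typically consume that (my_done, MY_TIMEOUT_MS and wait_for_device() are made-up names, and the fragment only builds inside a kernel tree):

#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

#define MY_TIMEOUT_MS 100

static DECLARE_COMPLETION(my_done);

static int wait_for_device(void)
{
        unsigned long left;

        left = wait_for_completion_timeout(&my_done,
                                           msecs_to_jiffies(MY_TIMEOUT_MS));
        if (!left)
                return -ETIMEDOUT;      /* 0: the timeout expired */

        /* > 0: completed, with 'left' jiffies of the timeout remaining */
        return 0;
}
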
@@ -3401,8 +2846,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
3401 * specified timeout to expire. The timeout is in jiffies. It is not 2846 * specified timeout to expire. The timeout is in jiffies. It is not
3402 * interruptible. The caller is accounted as waiting for IO. 2847 * interruptible. The caller is accounted as waiting for IO.
3403 * 2848 *
3404 * The return value is 0 if timed out, and positive (at least 1, or number of 2849 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
3405 * jiffies left till timeout) if completed. 2850 * till timeout) if completed.
3406 */ 2851 */
3407unsigned long __sched 2852unsigned long __sched
3408wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) 2853wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
@@ -3418,7 +2863,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout);
3418 * This waits for completion of a specific task to be signaled. It is 2863 * This waits for completion of a specific task to be signaled. It is
3419 * interruptible. 2864 * interruptible.
3420 * 2865 *
3421 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 2866 * Return: -ERESTARTSYS if interrupted, 0 if completed.
3422 */ 2867 */
3423int __sched wait_for_completion_interruptible(struct completion *x) 2868int __sched wait_for_completion_interruptible(struct completion *x)
3424{ 2869{
@@ -3437,8 +2882,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
3437 * This waits for either a completion of a specific task to be signaled or for a 2882 * This waits for either a completion of a specific task to be signaled or for a
3438 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 2883 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
3439 * 2884 *
3440 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 2885 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
3441 * positive (at least 1, or number of jiffies left till timeout) if completed. 2886 * or number of jiffies left till timeout) if completed.
3442 */ 2887 */
3443long __sched 2888long __sched
3444wait_for_completion_interruptible_timeout(struct completion *x, 2889wait_for_completion_interruptible_timeout(struct completion *x,
@@ -3455,7 +2900,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3455 * This waits to be signaled for completion of a specific task. It can be 2900 * This waits to be signaled for completion of a specific task. It can be
3456 * interrupted by a kill signal. 2901 * interrupted by a kill signal.
3457 * 2902 *
3458 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 2903 * Return: -ERESTARTSYS if interrupted, 0 if completed.
3459 */ 2904 */
3460int __sched wait_for_completion_killable(struct completion *x) 2905int __sched wait_for_completion_killable(struct completion *x)
3461{ 2906{
@@ -3475,8 +2920,8 @@ EXPORT_SYMBOL(wait_for_completion_killable);
3475 * signaled or for a specified timeout to expire. It can be 2920 * signaled or for a specified timeout to expire. It can be
3476 * interrupted by a kill signal. The timeout is in jiffies. 2921 * interrupted by a kill signal. The timeout is in jiffies.
3477 * 2922 *
3478 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 2923 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
3479 * positive (at least 1, or number of jiffies left till timeout) if completed. 2924 * or number of jiffies left till timeout) if completed.
3480 */ 2925 */
3481long __sched 2926long __sched
3482wait_for_completion_killable_timeout(struct completion *x, 2927wait_for_completion_killable_timeout(struct completion *x,
@@ -3490,7 +2935,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
3490 * try_wait_for_completion - try to decrement a completion without blocking 2935 * try_wait_for_completion - try to decrement a completion without blocking
3491 * @x: completion structure 2936 * @x: completion structure
3492 * 2937 *
3493 * Returns: 0 if a decrement cannot be done without blocking 2938 * Return: 0 if a decrement cannot be done without blocking
3494 * 1 if a decrement succeeded. 2939 * 1 if a decrement succeeded.
3495 * 2940 *
3496 * If a completion is being used as a counting completion, 2941 * If a completion is being used as a counting completion,
@@ -3517,7 +2962,7 @@ EXPORT_SYMBOL(try_wait_for_completion);
3517 * completion_done - Test to see if a completion has any waiters 2962 * completion_done - Test to see if a completion has any waiters
3518 * @x: completion structure 2963 * @x: completion structure
3519 * 2964 *
3520 * Returns: 0 if there are waiters (wait_for_completion() in progress) 2965 * Return: 0 if there are waiters (wait_for_completion() in progress)
3521 * 1 if there are no waiters. 2966 * 1 if there are no waiters.
3522 * 2967 *
3523 */ 2968 */
@@ -3754,7 +3199,7 @@ SYSCALL_DEFINE1(nice, int, increment)
3754 * task_prio - return the priority value of a given task. 3199 * task_prio - return the priority value of a given task.
3755 * @p: the task in question. 3200 * @p: the task in question.
3756 * 3201 *
3757 * This is the priority value as seen by users in /proc. 3202 * Return: The priority value as seen by users in /proc.
3758 * RT tasks are offset by -200. Normal tasks are centered 3203 * RT tasks are offset by -200. Normal tasks are centered
3759 * around 0, value goes from -16 to +15. 3204 * around 0, value goes from -16 to +15.
3760 */ 3205 */
@@ -3766,6 +3211,8 @@ int task_prio(const struct task_struct *p)
3766/** 3211/**
3767 * task_nice - return the nice value of a given task. 3212 * task_nice - return the nice value of a given task.
3768 * @p: the task in question. 3213 * @p: the task in question.
3214 *
3215 * Return: The nice value [ -20 ... 0 ... 19 ].
3769 */ 3216 */
3770int task_nice(const struct task_struct *p) 3217int task_nice(const struct task_struct *p)
3771{ 3218{
@@ -3776,6 +3223,8 @@ EXPORT_SYMBOL(task_nice);
3776/** 3223/**
3777 * idle_cpu - is a given cpu idle currently? 3224 * idle_cpu - is a given cpu idle currently?
3778 * @cpu: the processor in question. 3225 * @cpu: the processor in question.
3226 *
3227 * Return: 1 if the CPU is currently idle. 0 otherwise.
3779 */ 3228 */
3780int idle_cpu(int cpu) 3229int idle_cpu(int cpu)
3781{ 3230{
@@ -3798,6 +3247,8 @@ int idle_cpu(int cpu)
3798/** 3247/**
3799 * idle_task - return the idle task for a given cpu. 3248 * idle_task - return the idle task for a given cpu.
3800 * @cpu: the processor in question. 3249 * @cpu: the processor in question.
3250 *
3251 * Return: The idle task for the cpu @cpu.
3801 */ 3252 */
3802struct task_struct *idle_task(int cpu) 3253struct task_struct *idle_task(int cpu)
3803{ 3254{
@@ -3807,6 +3258,8 @@ struct task_struct *idle_task(int cpu)
3807/** 3258/**
3808 * find_process_by_pid - find a process with a matching PID value. 3259 * find_process_by_pid - find a process with a matching PID value.
3809 * @pid: the pid in question. 3260 * @pid: the pid in question.
3261 *
3262 * The task of @pid, if found. %NULL otherwise.
3810 */ 3263 */
3811static struct task_struct *find_process_by_pid(pid_t pid) 3264static struct task_struct *find_process_by_pid(pid_t pid)
3812{ 3265{
@@ -4004,6 +3457,8 @@ recheck:
4004 * @policy: new policy. 3457 * @policy: new policy.
4005 * @param: structure containing the new RT priority. 3458 * @param: structure containing the new RT priority.
4006 * 3459 *
3460 * Return: 0 on success. An error code otherwise.
3461 *
4007 * NOTE that the task may be already dead. 3462 * NOTE that the task may be already dead.
4008 */ 3463 */
4009int sched_setscheduler(struct task_struct *p, int policy, 3464int sched_setscheduler(struct task_struct *p, int policy,
@@ -4023,6 +3478,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
4023 * current context has permission. For example, this is needed in 3478 * current context has permission. For example, this is needed in
4024 * stop_machine(): we create temporary high priority worker threads, 3479 * stop_machine(): we create temporary high priority worker threads,
4025 * but our caller might not have that capability. 3480 * but our caller might not have that capability.
3481 *
3482 * Return: 0 on success. An error code otherwise.
4026 */ 3483 */
4027int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3484int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4028 const struct sched_param *param) 3485 const struct sched_param *param)
@@ -4057,6 +3514,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4057 * @pid: the pid in question. 3514 * @pid: the pid in question.
4058 * @policy: new policy. 3515 * @policy: new policy.
4059 * @param: structure containing the new RT priority. 3516 * @param: structure containing the new RT priority.
3517 *
3518 * Return: 0 on success. An error code otherwise.
4060 */ 3519 */
4061SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 3520SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4062 struct sched_param __user *, param) 3521 struct sched_param __user *, param)
@@ -4072,6 +3531,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4072 * sys_sched_setparam - set/change the RT priority of a thread 3531 * sys_sched_setparam - set/change the RT priority of a thread
4073 * @pid: the pid in question. 3532 * @pid: the pid in question.
4074 * @param: structure containing the new RT priority. 3533 * @param: structure containing the new RT priority.
3534 *
3535 * Return: 0 on success. An error code otherwise.
4075 */ 3536 */
4076SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3537SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4077{ 3538{
@@ -4081,6 +3542,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4081/** 3542/**
4082 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3543 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4083 * @pid: the pid in question. 3544 * @pid: the pid in question.
3545 *
3546 * Return: On success, the policy of the thread. Otherwise, a negative error
3547 * code.
4084 */ 3548 */
4085SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 3549SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4086{ 3550{
@@ -4107,6 +3571,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4107 * sys_sched_getparam - get the RT priority of a thread 3571 * sys_sched_getparam - get the RT priority of a thread
4108 * @pid: the pid in question. 3572 * @pid: the pid in question.
4109 * @param: structure containing the RT priority. 3573 * @param: structure containing the RT priority.
3574 *
3575 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
3576 * code.
4110 */ 3577 */
4111SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3578SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4112{ 3579{
@@ -4231,6 +3698,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4231 * @pid: pid of the process 3698 * @pid: pid of the process
4232 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3699 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4233 * @user_mask_ptr: user-space pointer to the new cpu mask 3700 * @user_mask_ptr: user-space pointer to the new cpu mask
3701 *
3702 * Return: 0 on success. An error code otherwise.
4234 */ 3703 */
4235SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 3704SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4236 unsigned long __user *, user_mask_ptr) 3705 unsigned long __user *, user_mask_ptr)
@@ -4282,6 +3751,8 @@ out_unlock:
4282 * @pid: pid of the process 3751 * @pid: pid of the process
4283 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3752 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4284 * @user_mask_ptr: user-space pointer to hold the current cpu mask 3753 * @user_mask_ptr: user-space pointer to hold the current cpu mask
3754 *
3755 * Return: 0 on success. An error code otherwise.
4285 */ 3756 */
4286SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 3757SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4287 unsigned long __user *, user_mask_ptr) 3758 unsigned long __user *, user_mask_ptr)
@@ -4316,6 +3787,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4316 * 3787 *
4317 * This function yields the current CPU to other tasks. If there are no 3788 * This function yields the current CPU to other tasks. If there are no
4318 * other threads running on this CPU then this function will return. 3789 * other threads running on this CPU then this function will return.
3790 *
3791 * Return: 0.
4319 */ 3792 */
4320SYSCALL_DEFINE0(sched_yield) 3793SYSCALL_DEFINE0(sched_yield)
4321{ 3794{
@@ -4441,7 +3914,7 @@ EXPORT_SYMBOL(yield);
4441 * It's the caller's job to ensure that the target task struct 3914 * It's the caller's job to ensure that the target task struct
4442 * can't go away on us before we can do any checks. 3915 * can't go away on us before we can do any checks.
4443 * 3916 *
4444 * Returns: 3917 * Return:
4445 * true (>0) if we indeed boosted the target task. 3918 * true (>0) if we indeed boosted the target task.
4446 * false (0) if we failed to boost the target. 3919 * false (0) if we failed to boost the target.
4447 * -ESRCH if there's no task to yield to. 3920 * -ESRCH if there's no task to yield to.
@@ -4544,8 +4017,9 @@ long __sched io_schedule_timeout(long timeout)
4544 * sys_sched_get_priority_max - return maximum RT priority. 4017 * sys_sched_get_priority_max - return maximum RT priority.
4545 * @policy: scheduling class. 4018 * @policy: scheduling class.
4546 * 4019 *
4547 * this syscall returns the maximum rt_priority that can be used 4020 * Return: On success, this syscall returns the maximum
4548 * by a given scheduling class. 4021 * rt_priority that can be used by a given scheduling class.
4022 * On failure, a negative error code is returned.
4549 */ 4023 */
4550SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4024SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4551{ 4025{
@@ -4569,8 +4043,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4569 * sys_sched_get_priority_min - return minimum RT priority. 4043 * sys_sched_get_priority_min - return minimum RT priority.
4570 * @policy: scheduling class. 4044 * @policy: scheduling class.
4571 * 4045 *
4572 * this syscall returns the minimum rt_priority that can be used 4046 * Return: On success, this syscall returns the minimum
4573 * by a given scheduling class. 4047 * rt_priority that can be used by a given scheduling class.
4048 * On failure, a negative error code is returned.
4574 */ 4049 */
4575SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4050SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4576{ 4051{
@@ -4596,6 +4071,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4596 * 4071 *
4597 * this syscall writes the default timeslice value of a given process 4072 * this syscall writes the default timeslice value of a given process
4598 * into the user-space timespec buffer. A value of '0' means infinity. 4073 * into the user-space timespec buffer. A value of '0' means infinity.
4074 *
4075 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
4076 * an error code.
4599 */ 4077 */
4600SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4078SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4601 struct timespec __user *, interval) 4079 struct timespec __user *, interval)
@@ -4705,7 +4183,7 @@ void show_state_filter(unsigned long state_filter)
4705 debug_show_all_locks(); 4183 debug_show_all_locks();
4706} 4184}
4707 4185
4708void __cpuinit init_idle_bootup_task(struct task_struct *idle) 4186void init_idle_bootup_task(struct task_struct *idle)
4709{ 4187{
4710 idle->sched_class = &idle_sched_class; 4188 idle->sched_class = &idle_sched_class;
4711} 4189}
@@ -4718,7 +4196,7 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4718 * NOTE: this function does not set the idle thread's NEED_RESCHED 4196 * NOTE: this function does not set the idle thread's NEED_RESCHED
4719 * flag, to make booting more robust. 4197 * flag, to make booting more robust.
4720 */ 4198 */
4721void __cpuinit init_idle(struct task_struct *idle, int cpu) 4199void init_idle(struct task_struct *idle, int cpu)
4722{ 4200{
4723 struct rq *rq = cpu_rq(cpu); 4201 struct rq *rq = cpu_rq(cpu);
4724 unsigned long flags; 4202 unsigned long flags;
@@ -4960,6 +4438,13 @@ static void migrate_tasks(unsigned int dead_cpu)
4960 */ 4438 */
4961 rq->stop = NULL; 4439 rq->stop = NULL;
4962 4440
4441 /*
4442 * put_prev_task() and pick_next_task() sched
4443 * class method both need to have an up-to-date
4444 * value of rq->clock[_task]
4445 */
4446 update_rq_clock(rq);
4447
4963 for ( ; ; ) { 4448 for ( ; ; ) {
4964 /* 4449 /*
4965 * There's this thread running, bail when that's the only 4450 * There's this thread running, bail when that's the only
@@ -5093,7 +4578,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5093 return table; 4578 return table;
5094} 4579}
5095 4580
5096static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 4581static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5097{ 4582{
5098 struct ctl_table *entry, *table; 4583 struct ctl_table *entry, *table;
5099 struct sched_domain *sd; 4584 struct sched_domain *sd;
@@ -5195,7 +4680,7 @@ static void set_rq_offline(struct rq *rq)
5195 * migration_call - callback that gets triggered when a CPU is added. 4680 * migration_call - callback that gets triggered when a CPU is added.
5196 * Here we can start up the necessary migration thread for the new CPU. 4681 * Here we can start up the necessary migration thread for the new CPU.
5197 */ 4682 */
5198static int __cpuinit 4683static int
5199migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 4684migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5200{ 4685{
5201 int cpu = (long)hcpu; 4686 int cpu = (long)hcpu;
@@ -5249,12 +4734,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5249 * happens before everything else. This has to be lower priority than 4734 * happens before everything else. This has to be lower priority than
5250 * the notifier in the perf_event subsystem, though. 4735 * the notifier in the perf_event subsystem, though.
5251 */ 4736 */
5252static struct notifier_block __cpuinitdata migration_notifier = { 4737static struct notifier_block migration_notifier = {
5253 .notifier_call = migration_call, 4738 .notifier_call = migration_call,
5254 .priority = CPU_PRI_MIGRATION, 4739 .priority = CPU_PRI_MIGRATION,
5255}; 4740};
5256 4741
5257static int __cpuinit sched_cpu_active(struct notifier_block *nfb, 4742static int sched_cpu_active(struct notifier_block *nfb,
5258 unsigned long action, void *hcpu) 4743 unsigned long action, void *hcpu)
5259{ 4744{
5260 switch (action & ~CPU_TASKS_FROZEN) { 4745 switch (action & ~CPU_TASKS_FROZEN) {
@@ -5267,7 +4752,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5267 } 4752 }
5268} 4753}
5269 4754
5270static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, 4755static int sched_cpu_inactive(struct notifier_block *nfb,
5271 unsigned long action, void *hcpu) 4756 unsigned long action, void *hcpu)
5272{ 4757{
5273 switch (action & ~CPU_TASKS_FROZEN) { 4758 switch (action & ~CPU_TASKS_FROZEN) {
@@ -5907,7 +5392,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5907 get_group(cpu, sdd, &sd->groups); 5392 get_group(cpu, sdd, &sd->groups);
5908 atomic_inc(&sd->groups->ref); 5393 atomic_inc(&sd->groups->ref);
5909 5394
5910 if (cpu != cpumask_first(sched_domain_span(sd))) 5395 if (cpu != cpumask_first(span))
5911 return 0; 5396 return 0;
5912 5397
5913 lockdep_assert_held(&sched_domains_mutex); 5398 lockdep_assert_held(&sched_domains_mutex);
@@ -5917,12 +5402,12 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5917 5402
5918 for_each_cpu(i, span) { 5403 for_each_cpu(i, span) {
5919 struct sched_group *sg; 5404 struct sched_group *sg;
5920 int group = get_group(i, sdd, &sg); 5405 int group, j;
5921 int j;
5922 5406
5923 if (cpumask_test_cpu(i, covered)) 5407 if (cpumask_test_cpu(i, covered))
5924 continue; 5408 continue;
5925 5409
5410 group = get_group(i, sdd, &sg);
5926 cpumask_clear(sched_group_cpus(sg)); 5411 cpumask_clear(sched_group_cpus(sg));
5927 sg->sgp->power = 0; 5412 sg->sgp->power = 0;
5928 cpumask_setall(sched_group_mask(sg)); 5413 cpumask_setall(sched_group_mask(sg));
@@ -5960,7 +5445,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5960{ 5445{
5961 struct sched_group *sg = sd->groups; 5446 struct sched_group *sg = sd->groups;
5962 5447
5963 WARN_ON(!sd || !sg); 5448 WARN_ON(!sg);
5964 5449
5965 do { 5450 do {
5966 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 5451 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
@@ -6125,6 +5610,9 @@ static struct sched_domain_topology_level default_topology[] = {
6125 5610
6126static struct sched_domain_topology_level *sched_domain_topology = default_topology; 5611static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6127 5612
5613#define for_each_sd_topology(tl) \
5614 for (tl = sched_domain_topology; tl->init; tl++)
5615
6128#ifdef CONFIG_NUMA 5616#ifdef CONFIG_NUMA
6129 5617
6130static int sched_domains_numa_levels; 5618static int sched_domains_numa_levels;
@@ -6422,7 +5910,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6422 struct sched_domain_topology_level *tl; 5910 struct sched_domain_topology_level *tl;
6423 int j; 5911 int j;
6424 5912
6425 for (tl = sched_domain_topology; tl->init; tl++) { 5913 for_each_sd_topology(tl) {
6426 struct sd_data *sdd = &tl->data; 5914 struct sd_data *sdd = &tl->data;
6427 5915
6428 sdd->sd = alloc_percpu(struct sched_domain *); 5916 sdd->sd = alloc_percpu(struct sched_domain *);
@@ -6475,7 +5963,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
6475 struct sched_domain_topology_level *tl; 5963 struct sched_domain_topology_level *tl;
6476 int j; 5964 int j;
6477 5965
6478 for (tl = sched_domain_topology; tl->init; tl++) { 5966 for_each_sd_topology(tl) {
6479 struct sd_data *sdd = &tl->data; 5967 struct sd_data *sdd = &tl->data;
6480 5968
6481 for_each_cpu(j, cpu_map) { 5969 for_each_cpu(j, cpu_map) {
@@ -6503,9 +5991,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
6503} 5991}
6504 5992
6505struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 5993struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6506 struct s_data *d, const struct cpumask *cpu_map, 5994 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6507 struct sched_domain_attr *attr, struct sched_domain *child, 5995 struct sched_domain *child, int cpu)
6508 int cpu)
6509{ 5996{
6510 struct sched_domain *sd = tl->init(tl, cpu); 5997 struct sched_domain *sd = tl->init(tl, cpu);
6511 if (!sd) 5998 if (!sd)
@@ -6516,8 +6003,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6516 sd->level = child->level + 1; 6003 sd->level = child->level + 1;
6517 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6004 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6518 child->parent = sd; 6005 child->parent = sd;
6006 sd->child = child;
6519 } 6007 }
6520 sd->child = child;
6521 set_domain_attribute(sd, attr); 6008 set_domain_attribute(sd, attr);
6522 6009
6523 return sd; 6010 return sd;
@@ -6530,7 +6017,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6530static int build_sched_domains(const struct cpumask *cpu_map, 6017static int build_sched_domains(const struct cpumask *cpu_map,
6531 struct sched_domain_attr *attr) 6018 struct sched_domain_attr *attr)
6532{ 6019{
6533 enum s_alloc alloc_state = sa_none; 6020 enum s_alloc alloc_state;
6534 struct sched_domain *sd; 6021 struct sched_domain *sd;
6535 struct s_data d; 6022 struct s_data d;
6536 int i, ret = -ENOMEM; 6023 int i, ret = -ENOMEM;
@@ -6544,18 +6031,15 @@ static int build_sched_domains(const struct cpumask *cpu_map,
6544 struct sched_domain_topology_level *tl; 6031 struct sched_domain_topology_level *tl;
6545 6032
6546 sd = NULL; 6033 sd = NULL;
6547 for (tl = sched_domain_topology; tl->init; tl++) { 6034 for_each_sd_topology(tl) {
6548 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 6035 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
6036 if (tl == sched_domain_topology)
6037 *per_cpu_ptr(d.sd, i) = sd;
6549 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 6038 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6550 sd->flags |= SD_OVERLAP; 6039 sd->flags |= SD_OVERLAP;
6551 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 6040 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6552 break; 6041 break;
6553 } 6042 }
6554
6555 while (sd->child)
6556 sd = sd->child;
6557
6558 *per_cpu_ptr(d.sd, i) = sd;
6559 } 6043 }
6560 6044
6561 /* Build the groups for the domains */ 6045 /* Build the groups for the domains */
@@ -6867,9 +6351,6 @@ void __init sched_init_smp(void)
6867 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6351 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6868 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6352 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6869 6353
6870 /* RT runtime code needs to handle some hotplug events */
6871 hotcpu_notifier(update_runtime, 0);
6872
6873 init_hrtick(); 6354 init_hrtick();
6874 6355
6875 /* Move init over to a non-isolated CPU */ 6356 /* Move init over to a non-isolated CPU */
@@ -7201,6 +6682,8 @@ void normalize_rt_tasks(void)
7201 * @cpu: the processor in question. 6682 * @cpu: the processor in question.
7202 * 6683 *
7203 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6684 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6685 *
6686 * Return: The current task for @cpu.
7204 */ 6687 */
7205struct task_struct *curr_task(int cpu) 6688struct task_struct *curr_task(int cpu)
7206{ 6689{
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 1095e878a46f..8b836b376d91 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -62,7 +62,7 @@ static int convert_prio(int prio)
62 * any discrepancies created by racing against the uncertainty of the current 62 * any discrepancies created by racing against the uncertainty of the current
63 * priority configuration. 63 * priority configuration.
64 * 64 *
65 * Returns: (int)bool - CPUs were found 65 * Return: (int)bool - CPUs were found
66 */ 66 */
67int cpupri_find(struct cpupri *cp, struct task_struct *p, 67int cpupri_find(struct cpupri *cp, struct task_struct *p,
68 struct cpumask *lowest_mask) 68 struct cpumask *lowest_mask)
@@ -203,7 +203,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
203 * cpupri_init - initialize the cpupri structure 203 * cpupri_init - initialize the cpupri structure
204 * @cp: The cpupri context 204 * @cp: The cpupri context
205 * 205 *
206 * Returns: -ENOMEM if memory fails. 206 * Return: -ENOMEM on memory allocation failure.
207 */ 207 */
208int cpupri_init(struct cpupri *cp) 208int cpupri_init(struct cpupri *cp)
209{ 209{
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index b5ccba22603b..a7959e05a9d5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -515,9 +515,8 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
515 515
516 for (;;) { 516 for (;;) {
517 /* Make sure "rtime" is the bigger of stime/rtime */ 517 /* Make sure "rtime" is the bigger of stime/rtime */
518 if (stime > rtime) { 518 if (stime > rtime)
519 u64 tmp = rtime; rtime = stime; stime = tmp; 519 swap(rtime, stime);
520 }
521 520
522 /* Make sure 'total' fits in 32 bits */ 521 /* Make sure 'total' fits in 32 bits */
523 if (total >> 32) 522 if (total >> 32)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 75024a673520..e076bddd4c66 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -209,22 +209,24 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
209 cfs_rq->nr_spread_over); 209 cfs_rq->nr_spread_over);
210 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); 210 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
211 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 211 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
212#ifdef CONFIG_FAIR_GROUP_SCHED
213#ifdef CONFIG_SMP 212#ifdef CONFIG_SMP
214 SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", 213 SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg",
215 cfs_rq->runnable_load_avg); 214 cfs_rq->runnable_load_avg);
216 SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", 215 SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
217 cfs_rq->blocked_load_avg); 216 cfs_rq->blocked_load_avg);
218 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg", 217#ifdef CONFIG_FAIR_GROUP_SCHED
219 (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg)); 218 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
220 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
221 cfs_rq->tg_load_contrib); 219 cfs_rq->tg_load_contrib);
222 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", 220 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
223 cfs_rq->tg_runnable_contrib); 221 cfs_rq->tg_runnable_contrib);
222 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
223 atomic_long_read(&cfs_rq->tg->load_avg));
224 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", 224 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
225 atomic_read(&cfs_rq->tg->runnable_avg)); 225 atomic_read(&cfs_rq->tg->runnable_avg));
226#endif 226#endif
227#endif
227 228
229#ifdef CONFIG_FAIR_GROUP_SCHED
228 print_cfs_group_stats(m, cpu, cfs_rq->tg); 230 print_cfs_group_stats(m, cpu, cfs_rq->tg);
229#endif 231#endif
230} 232}
@@ -493,15 +495,16 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
493 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, 495 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
494 get_nr_threads(p)); 496 get_nr_threads(p));
495 SEQ_printf(m, 497 SEQ_printf(m,
496 "---------------------------------------------------------\n"); 498 "---------------------------------------------------------"
499 "----------\n");
497#define __P(F) \ 500#define __P(F) \
498 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) 501 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
499#define P(F) \ 502#define P(F) \
500 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) 503 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
501#define __PN(F) \ 504#define __PN(F) \
502 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) 505 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
503#define PN(F) \ 506#define PN(F) \
504 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) 507 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
505 508
506 PN(se.exec_start); 509 PN(se.exec_start);
507 PN(se.vruntime); 510 PN(se.vruntime);
@@ -560,12 +563,18 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
560 } 563 }
561#endif 564#endif
562 __P(nr_switches); 565 __P(nr_switches);
563 SEQ_printf(m, "%-35s:%21Ld\n", 566 SEQ_printf(m, "%-45s:%21Ld\n",
564 "nr_voluntary_switches", (long long)p->nvcsw); 567 "nr_voluntary_switches", (long long)p->nvcsw);
565 SEQ_printf(m, "%-35s:%21Ld\n", 568 SEQ_printf(m, "%-45s:%21Ld\n",
566 "nr_involuntary_switches", (long long)p->nivcsw); 569 "nr_involuntary_switches", (long long)p->nivcsw);
567 570
568 P(se.load.weight); 571 P(se.load.weight);
572#ifdef CONFIG_SMP
573 P(se.avg.runnable_avg_sum);
574 P(se.avg.runnable_avg_period);
575 P(se.avg.load_avg_contrib);
576 P(se.avg.decay_count);
577#endif
569 P(policy); 578 P(policy);
570 P(prio); 579 P(prio);
571#undef PN 580#undef PN
@@ -579,7 +588,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
579 588
580 t0 = cpu_clock(this_cpu); 589 t0 = cpu_clock(this_cpu);
581 t1 = cpu_clock(this_cpu); 590 t1 = cpu_clock(this_cpu);
582 SEQ_printf(m, "%-35s:%21Ld\n", 591 SEQ_printf(m, "%-45s:%21Ld\n",
583 "clock-delta", (long long)(t1-t0)); 592 "clock-delta", (long long)(t1-t0));
584 } 593 }
585} 594}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c61a614465c8..68f1609ca149 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -113,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
113unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 113unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
114#endif 114#endif
115 115
116static inline void update_load_add(struct load_weight *lw, unsigned long inc)
117{
118 lw->weight += inc;
119 lw->inv_weight = 0;
120}
121
122static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
123{
124 lw->weight -= dec;
125 lw->inv_weight = 0;
126}
127
128static inline void update_load_set(struct load_weight *lw, unsigned long w)
129{
130 lw->weight = w;
131 lw->inv_weight = 0;
132}
133
116/* 134/*
117 * Increase the granularity value when there are more CPUs, 135 * Increase the granularity value when there are more CPUs,
118 * because with more CPUs the 'effective latency' as visible 136 * because with more CPUs the 'effective latency' as visible
@@ -662,6 +680,26 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
662 return calc_delta_fair(sched_slice(cfs_rq, se), se); 680 return calc_delta_fair(sched_slice(cfs_rq, se), se);
663} 681}
664 682
683#ifdef CONFIG_SMP
684static inline void __update_task_entity_contrib(struct sched_entity *se);
685
686/* Give new task start runnable values to heavy its load in infant time */
687void init_task_runnable_average(struct task_struct *p)
688{
689 u32 slice;
690
691 p->se.avg.decay_count = 0;
692 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
693 p->se.avg.runnable_avg_sum = slice;
694 p->se.avg.runnable_avg_period = slice;
695 __update_task_entity_contrib(&p->se);
696}
697#else
698void init_task_runnable_average(struct task_struct *p)
699{
700}
701#endif
702
665/* 703/*
666 * Update the current task's runtime statistics. Skip current tasks that 704 * Update the current task's runtime statistics. Skip current tasks that
667 * are not in our scheduling class. 705 * are not in our scheduling class.
@@ -686,7 +724,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
686static void update_curr(struct cfs_rq *cfs_rq) 724static void update_curr(struct cfs_rq *cfs_rq)
687{ 725{
688 struct sched_entity *curr = cfs_rq->curr; 726 struct sched_entity *curr = cfs_rq->curr;
689 u64 now = rq_of(cfs_rq)->clock_task; 727 u64 now = rq_clock_task(rq_of(cfs_rq));
690 unsigned long delta_exec; 728 unsigned long delta_exec;
691 729
692 if (unlikely(!curr)) 730 if (unlikely(!curr))
@@ -718,7 +756,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
718static inline void 756static inline void
719update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 757update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
720{ 758{
721 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); 759 schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
722} 760}
723 761
724/* 762/*
@@ -738,14 +776,14 @@ static void
738update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 776update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
739{ 777{
740 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, 778 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
741 rq_of(cfs_rq)->clock - se->statistics.wait_start)); 779 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
742 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); 780 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
743 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + 781 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
744 rq_of(cfs_rq)->clock - se->statistics.wait_start); 782 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
745#ifdef CONFIG_SCHEDSTATS 783#ifdef CONFIG_SCHEDSTATS
746 if (entity_is_task(se)) { 784 if (entity_is_task(se)) {
747 trace_sched_stat_wait(task_of(se), 785 trace_sched_stat_wait(task_of(se),
748 rq_of(cfs_rq)->clock - se->statistics.wait_start); 786 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
749 } 787 }
750#endif 788#endif
751 schedstat_set(se->statistics.wait_start, 0); 789 schedstat_set(se->statistics.wait_start, 0);
@@ -771,7 +809,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
771 /* 809 /*
772 * We are starting a new run period: 810 * We are starting a new run period:
773 */ 811 */
774 se->exec_start = rq_of(cfs_rq)->clock_task; 812 se->exec_start = rq_clock_task(rq_of(cfs_rq));
775} 813}
776 814
777/************************************************** 815/**************************************************
@@ -813,7 +851,7 @@ void task_numa_fault(int node, int pages, bool migrated)
813{ 851{
814 struct task_struct *p = current; 852 struct task_struct *p = current;
815 853
816 if (!sched_feat_numa(NUMA)) 854 if (!numabalancing_enabled)
817 return; 855 return;
818 856
819 /* FIXME: Allocate task-specific structure for placement policy here */ 857 /* FIXME: Allocate task-specific structure for placement policy here */
@@ -1037,7 +1075,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1037 * to gain a more accurate current total weight. See 1075 * to gain a more accurate current total weight. See
1038 * update_cfs_rq_load_contribution(). 1076 * update_cfs_rq_load_contribution().
1039 */ 1077 */
1040 tg_weight = atomic64_read(&tg->load_avg); 1078 tg_weight = atomic_long_read(&tg->load_avg);
1041 tg_weight -= cfs_rq->tg_load_contrib; 1079 tg_weight -= cfs_rq->tg_load_contrib;
1042 tg_weight += cfs_rq->load.weight; 1080 tg_weight += cfs_rq->load.weight;
1043 1081
@@ -1110,8 +1148,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1110} 1148}
1111#endif /* CONFIG_FAIR_GROUP_SCHED */ 1149#endif /* CONFIG_FAIR_GROUP_SCHED */
1112 1150
1113/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ 1151#ifdef CONFIG_SMP
1114#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1115/* 1152/*
1116 * We choose a half-life close to 1 scheduling period. 1153 * We choose a half-life close to 1 scheduling period.
1117 * Note: The tables below are dependent on this value. 1154 * Note: The tables below are dependent on this value.
@@ -1319,13 +1356,13 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1319 int force_update) 1356 int force_update)
1320{ 1357{
1321 struct task_group *tg = cfs_rq->tg; 1358 struct task_group *tg = cfs_rq->tg;
1322 s64 tg_contrib; 1359 long tg_contrib;
1323 1360
1324 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; 1361 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1325 tg_contrib -= cfs_rq->tg_load_contrib; 1362 tg_contrib -= cfs_rq->tg_load_contrib;
1326 1363
1327 if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { 1364 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1328 atomic64_add(tg_contrib, &tg->load_avg); 1365 atomic_long_add(tg_contrib, &tg->load_avg);
1329 cfs_rq->tg_load_contrib += tg_contrib; 1366 cfs_rq->tg_load_contrib += tg_contrib;
1330 } 1367 }
1331} 1368}
@@ -1360,8 +1397,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
1360 u64 contrib; 1397 u64 contrib;
1361 1398
1362 contrib = cfs_rq->tg_load_contrib * tg->shares; 1399 contrib = cfs_rq->tg_load_contrib * tg->shares;
1363 se->avg.load_avg_contrib = div64_u64(contrib, 1400 se->avg.load_avg_contrib = div_u64(contrib,
1364 atomic64_read(&tg->load_avg) + 1); 1401 atomic_long_read(&tg->load_avg) + 1);
1365 1402
1366 /* 1403 /*
1367 * For group entities we need to compute a correction term in the case 1404 * For group entities we need to compute a correction term in the case
@@ -1480,8 +1517,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1480 if (!decays && !force_update) 1517 if (!decays && !force_update)
1481 return; 1518 return;
1482 1519
1483 if (atomic64_read(&cfs_rq->removed_load)) { 1520 if (atomic_long_read(&cfs_rq->removed_load)) {
1484 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); 1521 unsigned long removed_load;
1522 removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
1485 subtract_blocked_load_contrib(cfs_rq, removed_load); 1523 subtract_blocked_load_contrib(cfs_rq, removed_load);
1486 } 1524 }
1487 1525
@@ -1497,7 +1535,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1497 1535
1498static inline void update_rq_runnable_avg(struct rq *rq, int runnable) 1536static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1499{ 1537{
1500 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); 1538 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
1501 __update_tg_runnable_avg(&rq->avg, &rq->cfs); 1539 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
1502} 1540}
1503 1541
@@ -1510,9 +1548,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1510 * We track migrations using entity decay_count <= 0, on a wake-up 1548 * We track migrations using entity decay_count <= 0, on a wake-up
1511 * migration we use a negative decay count to track the remote decays 1549 * migration we use a negative decay count to track the remote decays
1512 * accumulated while sleeping. 1550 * accumulated while sleeping.
1551 *
1552 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
1553 * are seen by enqueue_entity_load_avg() as a migration with an already
1554 * constructed load_avg_contrib.
1513 */ 1555 */
1514 if (unlikely(se->avg.decay_count <= 0)) { 1556 if (unlikely(se->avg.decay_count <= 0)) {
1515 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; 1557 se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
1516 if (se->avg.decay_count) { 1558 if (se->avg.decay_count) {
1517 /* 1559 /*
1518 * In a wake-up migration we have to approximate the 1560 * In a wake-up migration we have to approximate the
@@ -1530,7 +1572,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1530 } 1572 }
1531 wakeup = 0; 1573 wakeup = 0;
1532 } else { 1574 } else {
1533 __synchronize_entity_decay(se); 1575 /*
1576 * Task re-woke on same cpu (or else migrate_task_rq_fair()
1577 * would have made count negative); we must be careful to avoid
1578 * double-accounting blocked time after synchronizing decays.
1579 */
1580 se->avg.last_runnable_update += __synchronize_entity_decay(se)
1581 << 20;
1534 } 1582 }
1535 1583
1536 /* migrated tasks did not contribute to our blocked load */ 1584 /* migrated tasks did not contribute to our blocked load */
@@ -1607,7 +1655,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1607 tsk = task_of(se); 1655 tsk = task_of(se);
1608 1656
1609 if (se->statistics.sleep_start) { 1657 if (se->statistics.sleep_start) {
1610 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; 1658 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
1611 1659
1612 if ((s64)delta < 0) 1660 if ((s64)delta < 0)
1613 delta = 0; 1661 delta = 0;
@@ -1624,7 +1672,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1624 } 1672 }
1625 } 1673 }
1626 if (se->statistics.block_start) { 1674 if (se->statistics.block_start) {
1627 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; 1675 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
1628 1676
1629 if ((s64)delta < 0) 1677 if ((s64)delta < 0)
1630 delta = 0; 1678 delta = 0;
@@ -1712,7 +1760,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1712{ 1760{
1713 /* 1761 /*
1714 * Update the normalized vruntime before updating min_vruntime 1762 * Update the normalized vruntime before updating min_vruntime
1715 * through callig update_curr(). 1763 * through calling update_curr().
1716 */ 1764 */
1717 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) 1765 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1718 se->vruntime += cfs_rq->min_vruntime; 1766 se->vruntime += cfs_rq->min_vruntime;
@@ -1805,9 +1853,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1805 struct task_struct *tsk = task_of(se); 1853 struct task_struct *tsk = task_of(se);
1806 1854
1807 if (tsk->state & TASK_INTERRUPTIBLE) 1855 if (tsk->state & TASK_INTERRUPTIBLE)
1808 se->statistics.sleep_start = rq_of(cfs_rq)->clock; 1856 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
1809 if (tsk->state & TASK_UNINTERRUPTIBLE) 1857 if (tsk->state & TASK_UNINTERRUPTIBLE)
1810 se->statistics.block_start = rq_of(cfs_rq)->clock; 1858 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
1811 } 1859 }
1812#endif 1860#endif
1813 } 1861 }
@@ -1984,6 +2032,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1984 */ 2032 */
1985 update_entity_load_avg(curr, 1); 2033 update_entity_load_avg(curr, 1);
1986 update_cfs_rq_blocked_load(cfs_rq, 1); 2034 update_cfs_rq_blocked_load(cfs_rq, 1);
2035 update_cfs_shares(cfs_rq);
1987 2036
1988#ifdef CONFIG_SCHED_HRTICK 2037#ifdef CONFIG_SCHED_HRTICK
1989 /* 2038 /*
@@ -2082,7 +2131,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2082 if (unlikely(cfs_rq->throttle_count)) 2131 if (unlikely(cfs_rq->throttle_count))
2083 return cfs_rq->throttled_clock_task; 2132 return cfs_rq->throttled_clock_task;
2084 2133
2085 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; 2134 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
2086} 2135}
2087 2136
2088/* returns 0 on failure to allocate runtime */ 2137/* returns 0 on failure to allocate runtime */
@@ -2138,10 +2187,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2138static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) 2187static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2139{ 2188{
2140 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 2189 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2141 struct rq *rq = rq_of(cfs_rq);
2142 2190
2143 /* if the deadline is ahead of our clock, nothing to do */ 2191 /* if the deadline is ahead of our clock, nothing to do */
2144 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) 2192 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
2145 return; 2193 return;
2146 2194
2147 if (cfs_rq->runtime_remaining < 0) 2195 if (cfs_rq->runtime_remaining < 0)
@@ -2230,7 +2278,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
2230#ifdef CONFIG_SMP 2278#ifdef CONFIG_SMP
2231 if (!cfs_rq->throttle_count) { 2279 if (!cfs_rq->throttle_count) {
2232 /* adjust cfs_rq_clock_task() */ 2280 /* adjust cfs_rq_clock_task() */
2233 cfs_rq->throttled_clock_task_time += rq->clock_task - 2281 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
2234 cfs_rq->throttled_clock_task; 2282 cfs_rq->throttled_clock_task;
2235 } 2283 }
2236#endif 2284#endif
@@ -2245,7 +2293,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
2245 2293
2246 /* group is entering throttled state, stop time */ 2294 /* group is entering throttled state, stop time */
2247 if (!cfs_rq->throttle_count) 2295 if (!cfs_rq->throttle_count)
2248 cfs_rq->throttled_clock_task = rq->clock_task; 2296 cfs_rq->throttled_clock_task = rq_clock_task(rq);
2249 cfs_rq->throttle_count++; 2297 cfs_rq->throttle_count++;
2250 2298
2251 return 0; 2299 return 0;
@@ -2284,7 +2332,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2284 rq->nr_running -= task_delta; 2332 rq->nr_running -= task_delta;
2285 2333
2286 cfs_rq->throttled = 1; 2334 cfs_rq->throttled = 1;
2287 cfs_rq->throttled_clock = rq->clock; 2335 cfs_rq->throttled_clock = rq_clock(rq);
2288 raw_spin_lock(&cfs_b->lock); 2336 raw_spin_lock(&cfs_b->lock);
2289 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 2337 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2290 raw_spin_unlock(&cfs_b->lock); 2338 raw_spin_unlock(&cfs_b->lock);
@@ -2298,15 +2346,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
2298 int enqueue = 1; 2346 int enqueue = 1;
2299 long task_delta; 2347 long task_delta;
2300 2348
2301 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; 2349 se = cfs_rq->tg->se[cpu_of(rq)];
2302 2350
2303 cfs_rq->throttled = 0; 2351 cfs_rq->throttled = 0;
2352
2353 update_rq_clock(rq);
2354
2304 raw_spin_lock(&cfs_b->lock); 2355 raw_spin_lock(&cfs_b->lock);
2305 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; 2356 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
2306 list_del_rcu(&cfs_rq->throttled_list); 2357 list_del_rcu(&cfs_rq->throttled_list);
2307 raw_spin_unlock(&cfs_b->lock); 2358 raw_spin_unlock(&cfs_b->lock);
2308 2359
2309 update_rq_clock(rq);
2310 /* update hierarchical throttle state */ 2360 /* update hierarchical throttle state */
2311 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); 2361 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2312 2362
@@ -2599,10 +2649,6 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2599 throttle_cfs_rq(cfs_rq); 2649 throttle_cfs_rq(cfs_rq);
2600} 2650}
2601 2651
2602static inline u64 default_cfs_period(void);
2603static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2604static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2605
2606static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 2652static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2607{ 2653{
2608 struct cfs_bandwidth *cfs_b = 2654 struct cfs_bandwidth *cfs_b =
@@ -2706,7 +2752,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
2706#else /* CONFIG_CFS_BANDWIDTH */ 2752#else /* CONFIG_CFS_BANDWIDTH */
2707static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) 2753static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2708{ 2754{
2709 return rq_of(cfs_rq)->clock_task; 2755 return rq_clock_task(rq_of(cfs_rq));
2710} 2756}
2711 2757
2712static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2758static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
@@ -2919,7 +2965,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2919/* Used instead of source_load when we know the type == 0 */ 2965/* Used instead of source_load when we know the type == 0 */
2920static unsigned long weighted_cpuload(const int cpu) 2966static unsigned long weighted_cpuload(const int cpu)
2921{ 2967{
2922 return cpu_rq(cpu)->load.weight; 2968 return cpu_rq(cpu)->cfs.runnable_load_avg;
2923} 2969}
2924 2970
2925/* 2971/*
@@ -2964,9 +3010,10 @@ static unsigned long cpu_avg_load_per_task(int cpu)
2964{ 3010{
2965 struct rq *rq = cpu_rq(cpu); 3011 struct rq *rq = cpu_rq(cpu);
2966 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 3012 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
3013 unsigned long load_avg = rq->cfs.runnable_load_avg;
2967 3014
2968 if (nr_running) 3015 if (nr_running)
2969 return rq->load.weight / nr_running; 3016 return load_avg / nr_running;
2970 3017
2971 return 0; 3018 return 0;
2972} 3019}
@@ -3416,12 +3463,6 @@ unlock:
3416} 3463}
3417 3464
3418/* 3465/*
3419 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
3420 * removed when useful for applications beyond shares distribution (e.g.
3421 * load-balance).
3422 */
3423#ifdef CONFIG_FAIR_GROUP_SCHED
3424/*
3425 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 3466 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3426 * cfs_rq_of(p) references at time of call are still valid and identify the 3467 * cfs_rq_of(p) references at time of call are still valid and identify the
3427 * previous cpu. However, the caller only guarantees p->pi_lock is held; no 3468 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
@@ -3441,10 +3482,10 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3441 */ 3482 */
3442 if (se->avg.decay_count) { 3483 if (se->avg.decay_count) {
3443 se->avg.decay_count = -__synchronize_entity_decay(se); 3484 se->avg.decay_count = -__synchronize_entity_decay(se);
3444 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); 3485 atomic_long_add(se->avg.load_avg_contrib,
3486 &cfs_rq->removed_load);
3445 } 3487 }
3446} 3488}
3447#endif
3448#endif /* CONFIG_SMP */ 3489#endif /* CONFIG_SMP */
3449 3490
3450static unsigned long 3491static unsigned long
@@ -3946,7 +3987,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3946 * 2) too many balance attempts have failed. 3987 * 2) too many balance attempts have failed.
3947 */ 3988 */
3948 3989
3949 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); 3990 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
3950 if (!tsk_cache_hot || 3991 if (!tsk_cache_hot ||
3951 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 3992 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3952 3993
@@ -4141,11 +4182,11 @@ static int tg_load_down(struct task_group *tg, void *data)
4141 long cpu = (long)data; 4182 long cpu = (long)data;
4142 4183
4143 if (!tg->parent) { 4184 if (!tg->parent) {
4144 load = cpu_rq(cpu)->load.weight; 4185 load = cpu_rq(cpu)->avg.load_avg_contrib;
4145 } else { 4186 } else {
4146 load = tg->parent->cfs_rq[cpu]->h_load; 4187 load = tg->parent->cfs_rq[cpu]->h_load;
4147 load *= tg->se[cpu]->load.weight; 4188 load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
4148 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 4189 tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
4149 } 4190 }
4150 4191
4151 tg->cfs_rq[cpu]->h_load = load; 4192 tg->cfs_rq[cpu]->h_load = load;
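
For illustration (not part of the patch above): the new h_load computation scales the parent group's hierarchical load by this entity's share of the parent's runnable load average. A minimal userspace sketch of that arithmetic, with hypothetical numbers and a plain division standing in for the kernel's div64_ul() helper:

    #include <stdio.h>
    #include <stdint.h>

    /* plain division standing in for the kernel's div64_ul() helper */
    static uint64_t div64_ul(uint64_t dividend, uint64_t divisor)
    {
            return dividend / divisor;
    }

    int main(void)
    {
            /* hypothetical per-cpu numbers for a two-level hierarchy */
            uint64_t root_contrib = 2048;   /* cpu_rq(cpu)->avg.load_avg_contrib */
            uint64_t se_contrib   = 512;    /* tg->se[cpu]->avg.load_avg_contrib */
            uint64_t parent_rla   = 1024;   /* parent cfs_rq runnable_load_avg   */

            /* root group: h_load is the rq's own tracked contribution */
            uint64_t parent_h_load = root_contrib;

            /* child group: scale the parent's h_load by this entity's share */
            uint64_t child_h_load =
                    div64_ul(parent_h_load * se_contrib, parent_rla + 1);

            printf("child h_load = %llu\n", (unsigned long long)child_h_load);
            return 0;
    }
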
@@ -4171,12 +4212,9 @@ static void update_h_load(long cpu)
4171static unsigned long task_h_load(struct task_struct *p) 4212static unsigned long task_h_load(struct task_struct *p)
4172{ 4213{
4173 struct cfs_rq *cfs_rq = task_cfs_rq(p); 4214 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4174 unsigned long load;
4175 4215
4176 load = p->se.load.weight; 4216 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
4177 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); 4217 cfs_rq->runnable_load_avg + 1);
4178
4179 return load;
4180} 4218}
4181#else 4219#else
4182static inline void update_blocked_averages(int cpu) 4220static inline void update_blocked_averages(int cpu)
@@ -4189,7 +4227,7 @@ static inline void update_h_load(long cpu)
4189 4227
4190static unsigned long task_h_load(struct task_struct *p) 4228static unsigned long task_h_load(struct task_struct *p)
4191{ 4229{
4192 return p->se.load.weight; 4230 return p->se.avg.load_avg_contrib;
4193} 4231}
4194#endif 4232#endif
4195 4233
@@ -4243,6 +4281,8 @@ struct sg_lb_stats {
4243 * get_sd_load_idx - Obtain the load index for a given sched domain. 4281 * get_sd_load_idx - Obtain the load index for a given sched domain.
4244 * @sd: The sched_domain whose load_idx is to be obtained. 4282 * @sd: The sched_domain whose load_idx is to be obtained.
4245 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 4283 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
4284 *
4285 * Return: The load index.
4246 */ 4286 */
4247static inline int get_sd_load_idx(struct sched_domain *sd, 4287static inline int get_sd_load_idx(struct sched_domain *sd,
4248 enum cpu_idle_type idle) 4288 enum cpu_idle_type idle)
@@ -4302,7 +4342,7 @@ static unsigned long scale_rt_power(int cpu)
4302 age_stamp = ACCESS_ONCE(rq->age_stamp); 4342 age_stamp = ACCESS_ONCE(rq->age_stamp);
4303 avg = ACCESS_ONCE(rq->rt_avg); 4343 avg = ACCESS_ONCE(rq->rt_avg);
4304 4344
4305 total = sched_avg_period() + (rq->clock - age_stamp); 4345 total = sched_avg_period() + (rq_clock(rq) - age_stamp);
4306 4346
4307 if (unlikely(total < avg)) { 4347 if (unlikely(total < avg)) {
4308 /* Ensures that power won't end up being negative */ 4348 /* Ensures that power won't end up being negative */
@@ -4537,6 +4577,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4537 * 4577 *
4538 * Determine if @sg is a busier group than the previously selected 4578 * Determine if @sg is a busier group than the previously selected
4539 * busiest group. 4579 * busiest group.
4580 *
4581 * Return: %true if @sg is a busier group than the previously selected
4582 * busiest group. %false otherwise.
4540 */ 4583 */
4541static bool update_sd_pick_busiest(struct lb_env *env, 4584static bool update_sd_pick_busiest(struct lb_env *env,
4542 struct sd_lb_stats *sds, 4585 struct sd_lb_stats *sds,
@@ -4654,7 +4697,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4654 * assuming lower CPU number will be equivalent to lower a SMT thread 4697 * assuming lower CPU number will be equivalent to lower a SMT thread
4655 * number. 4698 * number.
4656 * 4699 *
4657 * Returns 1 when packing is required and a task should be moved to 4700 * Return: 1 when packing is required and a task should be moved to
4658 * this CPU. The amount of the imbalance is returned in *imbalance. 4701 * this CPU. The amount of the imbalance is returned in *imbalance.
4659 * 4702 *
4660 * @env: The load balancing environment. 4703 * @env: The load balancing environment.
@@ -4832,7 +4875,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4832 * @balance: Pointer to a variable indicating if this_cpu 4875 * @balance: Pointer to a variable indicating if this_cpu
4833 * is the appropriate cpu to perform load balancing at this_level. 4876 * is the appropriate cpu to perform load balancing at this_level.
4834 * 4877 *
4835 * Returns: - the busiest group if imbalance exists. 4878 * Return: - The busiest group if imbalance exists.
4836 * - If no imbalance and user has opted for power-savings balance, 4879 * - If no imbalance and user has opted for power-savings balance,
4837 * return the least loaded group whose CPUs can be 4880 * return the least loaded group whose CPUs can be
4838 * put to idle by rebalancing its tasks onto our group. 4881 * put to idle by rebalancing its tasks onto our group.
@@ -5241,7 +5284,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5241 int pulled_task = 0; 5284 int pulled_task = 0;
5242 unsigned long next_balance = jiffies + HZ; 5285 unsigned long next_balance = jiffies + HZ;
5243 5286
5244 this_rq->idle_stamp = this_rq->clock; 5287 this_rq->idle_stamp = rq_clock(this_rq);
5245 5288
5246 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5289 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5247 return; 5290 return;
@@ -5418,10 +5461,9 @@ static inline void nohz_balance_exit_idle(int cpu)
5418static inline void set_cpu_sd_state_busy(void) 5461static inline void set_cpu_sd_state_busy(void)
5419{ 5462{
5420 struct sched_domain *sd; 5463 struct sched_domain *sd;
5421 int cpu = smp_processor_id();
5422 5464
5423 rcu_read_lock(); 5465 rcu_read_lock();
5424 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); 5466 sd = rcu_dereference_check_sched_domain(this_rq()->sd);
5425 5467
5426 if (!sd || !sd->nohz_idle) 5468 if (!sd || !sd->nohz_idle)
5427 goto unlock; 5469 goto unlock;
@@ -5436,10 +5478,9 @@ unlock:
5436void set_cpu_sd_state_idle(void) 5478void set_cpu_sd_state_idle(void)
5437{ 5479{
5438 struct sched_domain *sd; 5480 struct sched_domain *sd;
5439 int cpu = smp_processor_id();
5440 5481
5441 rcu_read_lock(); 5482 rcu_read_lock();
5442 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); 5483 sd = rcu_dereference_check_sched_domain(this_rq()->sd);
5443 5484
5444 if (!sd || sd->nohz_idle) 5485 if (!sd || sd->nohz_idle)
5445 goto unlock; 5486 goto unlock;
@@ -5471,7 +5512,7 @@ void nohz_balance_enter_idle(int cpu)
5471 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 5512 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5472} 5513}
5473 5514
5474static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, 5515static int sched_ilb_notifier(struct notifier_block *nfb,
5475 unsigned long action, void *hcpu) 5516 unsigned long action, void *hcpu)
5476{ 5517{
5477 switch (action & ~CPU_TASKS_FROZEN) { 5518 switch (action & ~CPU_TASKS_FROZEN) {
@@ -5751,7 +5792,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
5751 entity_tick(cfs_rq, se, queued); 5792 entity_tick(cfs_rq, se, queued);
5752 } 5793 }
5753 5794
5754 if (sched_feat_numa(NUMA)) 5795 if (numabalancing_enabled)
5755 task_tick_numa(rq, curr); 5796 task_tick_numa(rq, curr);
5756 5797
5757 update_rq_runnable_avg(rq, 1); 5798 update_rq_runnable_avg(rq, 1);
@@ -5848,7 +5889,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
5848 se->vruntime -= cfs_rq->min_vruntime; 5889 se->vruntime -= cfs_rq->min_vruntime;
5849 } 5890 }
5850 5891
5851#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) 5892#ifdef CONFIG_SMP
5852 /* 5893 /*
5853 * Remove our load from contribution when we leave sched_fair 5894 * Remove our load from contribution when we leave sched_fair
5854 * and ensure we don't carry in an old decay_count if we 5895 * and ensure we don't carry in an old decay_count if we
@@ -5907,9 +5948,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
5907#ifndef CONFIG_64BIT 5948#ifndef CONFIG_64BIT
5908 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 5949 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5909#endif 5950#endif
5910#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) 5951#ifdef CONFIG_SMP
5911 atomic64_set(&cfs_rq->decay_counter, 1); 5952 atomic64_set(&cfs_rq->decay_counter, 1);
5912 atomic64_set(&cfs_rq->removed_load, 0); 5953 atomic_long_set(&cfs_rq->removed_load, 0);
5913#endif 5954#endif
5914} 5955}
5915 5956
@@ -6091,6 +6132,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6091 se = tg->se[i]; 6132 se = tg->se[i];
6092 /* Propagate contribution to hierarchy */ 6133 /* Propagate contribution to hierarchy */
6093 raw_spin_lock_irqsave(&rq->lock, flags); 6134 raw_spin_lock_irqsave(&rq->lock, flags);
6135
6136 /* Possible calls to update_curr() need rq clock */
6137 update_rq_clock(rq);
6094 for_each_sched_entity(se) 6138 for_each_sched_entity(se)
6095 update_cfs_shares(group_cfs_rq(se)); 6139 update_cfs_shares(group_cfs_rq(se));
6096 raw_spin_unlock_irqrestore(&rq->lock, flags); 6140 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6146,9 +6190,8 @@ const struct sched_class fair_sched_class = {
6146 6190
6147#ifdef CONFIG_SMP 6191#ifdef CONFIG_SMP
6148 .select_task_rq = select_task_rq_fair, 6192 .select_task_rq = select_task_rq_fair,
6149#ifdef CONFIG_FAIR_GROUP_SCHED
6150 .migrate_task_rq = migrate_task_rq_fair, 6193 .migrate_task_rq = migrate_task_rq_fair,
6151#endif 6194
6152 .rq_online = rq_online_fair, 6195 .rq_online = rq_online_fair,
6153 .rq_offline = rq_offline_fair, 6196 .rq_offline = rq_offline_fair,
6154 6197
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
new file mode 100644
index 000000000000..16f5a30f9c88
--- /dev/null
+++ b/kernel/sched/proc.c
@@ -0,0 +1,591 @@
1/*
2 * kernel/sched/proc.c
3 *
4 * Kernel load calculations, forked from sched/core.c
5 */
6
7#include <linux/export.h>
8
9#include "sched.h"
10
11unsigned long this_cpu_load(void)
12{
13 struct rq *this = this_rq();
14 return this->cpu_load[0];
15}
16
17
18/*
19 * Global load-average calculations
20 *
21 * We take a distributed and async approach to calculating the global load-avg
22 * in order to minimize overhead.
23 *
24 * The global load average is an exponentially decaying average of nr_running +
25 * nr_uninterruptible.
26 *
27 * Once every LOAD_FREQ:
28 *
29 * nr_active = 0;
30 * for_each_possible_cpu(cpu)
31 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
32 *
33 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
34 *
35 * Due to a number of reasons the above turns in the mess below:
36 *
37 * - for_each_possible_cpu() is prohibitively expensive on machines with
38 * serious number of cpus, therefore we need to take a distributed approach
39 * to calculating nr_active.
40 *
41 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
42 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
43 *
44 * So assuming nr_active := 0 when we start out -- true per definition, we
45 * can simply take per-cpu deltas and fold those into a global accumulate
46 * to obtain the same result. See calc_load_fold_active().
47 *
48 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
49 * across the machine, we assume 10 ticks is sufficient time for every
50 * cpu to have completed this task.
51 *
52 * This places an upper-bound on the IRQ-off latency of the machine. Then
 53 * again, being late doesn't lose the delta, just wrecks the sample.
54 *
55 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
56 * this would add another cross-cpu cacheline miss and atomic operation
57 * to the wakeup path. Instead we increment on whatever cpu the task ran
58 * when it went into uninterruptible state and decrement on whatever cpu
59 * did the wakeup. This means that only the sum of nr_uninterruptible over
60 * all cpus yields the correct result.
61 *
62 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
63 */
64
65/* Variables and functions for calc_load */
66atomic_long_t calc_load_tasks;
67unsigned long calc_load_update;
68unsigned long avenrun[3];
69EXPORT_SYMBOL(avenrun); /* should be removed */
70
71/**
72 * get_avenrun - get the load average array
73 * @loads: pointer to dest load array
74 * @offset: offset to add
75 * @shift: shift count to shift the result left
76 *
77 * These values are estimates at best, so no need for locking.
78 */
79void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
80{
81 loads[0] = (avenrun[0] + offset) << shift;
82 loads[1] = (avenrun[1] + offset) << shift;
83 loads[2] = (avenrun[2] + offset) << shift;
84}
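
For illustration: the raw avenrun[] samples are fixed-point values, and readers such as /proc/loadavg split them into the familiar integer/fraction form. A minimal userspace sketch of that conversion, assuming FSHIFT/FIXED_1 as defined in include/linux/sched.h and a hypothetical raw sample (not part of the patch):

    #include <stdio.h>

    #define FSHIFT  11                      /* bits of fixed-point precision */
    #define FIXED_1 (1UL << FSHIFT)         /* 1.0 in fixed point == 2048 */

    /* split a fixed-point load sample into integer and two-digit fraction */
    #define LOAD_INT(x)  ((x) >> FSHIFT)
    #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

    int main(void)
    {
            unsigned long raw = 860;                /* hypothetical avenrun[0], i.e. ~0.42 */
            /* the FIXED_1/200 offset rounds to the nearest hundredth */
            unsigned long load = raw + FIXED_1 / 200;

            printf("loadavg: %lu.%02lu\n", LOAD_INT(load), LOAD_FRAC(load));
            return 0;
    }
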
85
86long calc_load_fold_active(struct rq *this_rq)
87{
88 long nr_active, delta = 0;
89
90 nr_active = this_rq->nr_running;
91 nr_active += (long) this_rq->nr_uninterruptible;
92
93 if (nr_active != this_rq->calc_load_active) {
94 delta = nr_active - this_rq->calc_load_active;
95 this_rq->calc_load_active = nr_active;
96 }
97
98 return delta;
99}
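
To make the delta-folding identity above concrete, here is a small userspace sketch (not part of the patch) with the rq state replaced by plain arrays; it shows that summing the folded per-cpu deltas reproduces the direct sum of the per-cpu active counts:

    #include <stdio.h>

    #define NR_CPUS 4

    /* per-cpu state: current active count and the last value we folded */
    static long nr_active[NR_CPUS];
    static long calc_load_active[NR_CPUS];
    static long calc_load_tasks;            /* the global accumulator */

    /* same idea as calc_load_fold_active(): fold only the delta */
    static long fold_active(int cpu)
    {
            long delta = nr_active[cpu] - calc_load_active[cpu];

            calc_load_active[cpu] = nr_active[cpu];
            return delta;
    }

    int main(void)
    {
            int cpu, round;
            long direct;

            for (round = 0; round < 3; round++) {
                    /* pretend runnable/uninterruptible counts changed */
                    for (cpu = 0; cpu < NR_CPUS; cpu++)
                            nr_active[cpu] = (cpu + round) % 3;

                    /* each cpu folds its own delta at its own time */
                    for (cpu = 0; cpu < NR_CPUS; cpu++)
                            calc_load_tasks += fold_active(cpu);

                    direct = 0;
                    for (cpu = 0; cpu < NR_CPUS; cpu++)
                            direct += nr_active[cpu];

                    printf("round %d: folded=%ld direct=%ld\n",
                           round, calc_load_tasks, direct);
            }
            return 0;
    }
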
100
101/*
102 * a1 = a0 * e + a * (1 - e)
103 */
104static unsigned long
105calc_load(unsigned long load, unsigned long exp, unsigned long active)
106{
107 load *= exp;
108 load += active * (FIXED_1 - exp);
109 load += 1UL << (FSHIFT - 1);
110 return load >> FSHIFT;
111}
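
A worked example of the exponential-average step above, as a small userspace sketch (not part of the patch); the EXP_1/EXP_5/EXP_15 fixed-point constants are assumed from include/linux/sched.h and the workload (two tasks runnable for ten LOAD_FREQ windows) is hypothetical:

    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1UL << FSHIFT)
    #define EXP_1   1884    /* 1/exp(5sec/1min) in fixed point */
    #define EXP_5   2014    /* 1/exp(5sec/5min) */
    #define EXP_15  2037    /* 1/exp(5sec/15min) */

    /* same arithmetic as calc_load() above */
    static unsigned long
    calc_load(unsigned long load, unsigned long exp, unsigned long active)
    {
            load *= exp;
            load += active * (FIXED_1 - exp);
            load += 1UL << (FSHIFT - 1);    /* round to nearest */
            return load >> FSHIFT;
    }

    int main(void)
    {
            unsigned long avenrun[3] = { 0, 0, 0 };
            unsigned long active = 2 * FIXED_1;     /* two runnable tasks, steady */
            int i;

            /* ten LOAD_FREQ (~5s) windows of constant load 2.0 */
            for (i = 0; i < 10; i++) {
                    avenrun[0] = calc_load(avenrun[0], EXP_1, active);
                    avenrun[1] = calc_load(avenrun[1], EXP_5, active);
                    avenrun[2] = calc_load(avenrun[2], EXP_15, active);
            }

            printf("1min=%lu.%02lu 5min=%lu.%02lu 15min=%lu.%02lu\n",
                   avenrun[0] >> FSHIFT, ((avenrun[0] & (FIXED_1 - 1)) * 100) >> FSHIFT,
                   avenrun[1] >> FSHIFT, ((avenrun[1] & (FIXED_1 - 1)) * 100) >> FSHIFT,
                   avenrun[2] >> FSHIFT, ((avenrun[2] & (FIXED_1 - 1)) * 100) >> FSHIFT);
            return 0;
    }
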
112
113#ifdef CONFIG_NO_HZ_COMMON
114/*
115 * Handle NO_HZ for the global load-average.
116 *
117 * Since the above described distributed algorithm to compute the global
118 * load-average relies on per-cpu sampling from the tick, it is affected by
119 * NO_HZ.
120 *
121 * The basic idea is to fold the nr_active delta into a global idle-delta upon
122 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
123 * when we read the global state.
124 *
125 * Obviously reality has to ruin such a delightfully simple scheme:
126 *
127 * - When we go NO_HZ idle during the window, we can negate our sample
128 * contribution, causing under-accounting.
129 *
130 * We avoid this by keeping two idle-delta counters and flipping them
131 * when the window starts, thus separating old and new NO_HZ load.
132 *
133 * The only trick is the slight shift in index flip for read vs write.
134 *
135 * 0s 5s 10s 15s
136 * +10 +10 +10 +10
137 * |-|-----------|-|-----------|-|-----------|-|
138 * r:0 0 1 1 0 0 1 1 0
139 * w:0 1 1 0 0 1 1 0 0
140 *
141 * This ensures we'll fold the old idle contribution in this window while
142 * accumulating the new one.
143 *
144 * - When we wake up from NO_HZ idle during the window, we push up our
145 * contribution, since we effectively move our sample point to a known
146 * busy state.
147 *
148 * This is solved by pushing the window forward, and thus skipping the
149 * sample, for this cpu (effectively using the idle-delta for this cpu which
150 * was in effect at the time the window opened). This also solves the issue
151 * of having to deal with a cpu having been in NOHZ idle for multiple
152 * LOAD_FREQ intervals.
153 *
154 * When making the ILB scale, we should try to pull this in as well.
155 */
156static atomic_long_t calc_load_idle[2];
157static int calc_load_idx;
158
159static inline int calc_load_write_idx(void)
160{
161 int idx = calc_load_idx;
162
163 /*
164 * See calc_global_nohz(), if we observe the new index, we also
165 * need to observe the new update time.
166 */
167 smp_rmb();
168
169 /*
170 * If the folding window started, make sure we start writing in the
171 * next idle-delta.
172 */
173 if (!time_before(jiffies, calc_load_update))
174 idx++;
175
176 return idx & 1;
177}
178
179static inline int calc_load_read_idx(void)
180{
181 return calc_load_idx & 1;
182}
183
184void calc_load_enter_idle(void)
185{
186 struct rq *this_rq = this_rq();
187 long delta;
188
189 /*
190 * We're going into NOHZ mode, if there's any pending delta, fold it
191 * into the pending idle delta.
192 */
193 delta = calc_load_fold_active(this_rq);
194 if (delta) {
195 int idx = calc_load_write_idx();
196 atomic_long_add(delta, &calc_load_idle[idx]);
197 }
198}
199
200void calc_load_exit_idle(void)
201{
202 struct rq *this_rq = this_rq();
203
204 /*
205 * If we're still before the sample window, we're done.
206 */
207 if (time_before(jiffies, this_rq->calc_load_update))
208 return;
209
210 /*
211 * We woke inside or after the sample window, this means we're already
212 * accounted through the nohz accounting, so skip the entire deal and
213 * sync up for the next window.
214 */
215 this_rq->calc_load_update = calc_load_update;
216 if (time_before(jiffies, this_rq->calc_load_update + 10))
217 this_rq->calc_load_update += LOAD_FREQ;
218}
219
220static long calc_load_fold_idle(void)
221{
222 int idx = calc_load_read_idx();
223 long delta = 0;
224
225 if (atomic_long_read(&calc_load_idle[idx]))
226 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
227
228 return delta;
229}
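
A deliberately simplified toy model of the two idle-delta buckets (not part of the patch): an integer stands in for jiffies, there is no locking, and it only shows why a delta written after the window opens is not folded until the next window:

    #include <stdio.h>

    /* toy model of the calc_load_idle[] double buffer */
    static long idle_bucket[2];
    static int  calc_load_idx;
    static long window_start;               /* stands in for calc_load_update */

    static int write_idx(long now)
    {
            int idx = calc_load_idx;

            /* once the fold window has opened, park new deltas in the
             * other bucket so the reader of this window won't see them */
            if (now >= window_start)
                    idx++;
            return idx & 1;
    }

    static long fold_idle(void)
    {
            int idx = calc_load_idx & 1;
            long delta = idle_bucket[idx];

            idle_bucket[idx] = 0;
            return delta;
    }

    int main(void)
    {
            window_start = 100;

            idle_bucket[write_idx(90)]  += 2;   /* went idle before the window */
            idle_bucket[write_idx(105)] += 5;   /* went idle after it opened   */

            /* global update for this window: only the old delta is folded */
            printf("folded now: %ld\n", fold_idle());

            calc_load_idx++;                    /* flip, as calc_global_nohz() does */
            window_start += 50;                 /* advance to the next window       */

            printf("folded next window: %ld\n", fold_idle());
            return 0;
    }
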
230
231/**
232 * fixed_power_int - compute: x^n, in O(log n) time
233 *
234 * @x: base of the power
235 * @frac_bits: fractional bits of @x
236 * @n: power to raise @x to.
237 *
238 * By exploiting the relation between the definition of the natural power
239 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
240 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
241 * (where: n_i \elem {0, 1}, the binary vector representing n),
242 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
243 * of course trivially computable in O(log_2 n), the length of our binary
244 * vector.
245 */
246static unsigned long
247fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
248{
249 unsigned long result = 1UL << frac_bits;
250
251 if (n) for (;;) {
252 if (n & 1) {
253 result *= x;
254 result += 1UL << (frac_bits - 1);
255 result >>= frac_bits;
256 }
257 n >>= 1;
258 if (!n)
259 break;
260 x *= x;
261 x += 1UL << (frac_bits - 1);
262 x >>= frac_bits;
263 }
264
265 return result;
266}
267
268/*
269 * a1 = a0 * e + a * (1 - e)
270 *
271 * a2 = a1 * e + a * (1 - e)
272 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
273 * = a0 * e^2 + a * (1 - e) * (1 + e)
274 *
275 * a3 = a2 * e + a * (1 - e)
276 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
277 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
278 *
279 * ...
280 *
281 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
282 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
283 * = a0 * e^n + a * (1 - e^n)
284 *
285 * [1] application of the geometric series:
286 *
287 * n 1 - x^(n+1)
288 * S_n := \Sum x^i = -------------
289 * i=0 1 - x
290 */
291static unsigned long
292calc_load_n(unsigned long load, unsigned long exp,
293 unsigned long active, unsigned int n)
294{
295
296 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
297}
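
A minimal userspace check (not part of the patch) that folding n missed windows in one step via fixed_power_int() tracks the result of iterating calc_load() n times, up to fixed-point rounding; the starting load and n are hypothetical:

    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1UL << FSHIFT)
    #define EXP_1   1884

    static unsigned long
    calc_load(unsigned long load, unsigned long exp, unsigned long active)
    {
            load *= exp;
            load += active * (FIXED_1 - exp);
            load += 1UL << (FSHIFT - 1);
            return load >> FSHIFT;
    }

    /* same O(log n) fixed-point exponentiation as fixed_power_int() above */
    static unsigned long
    fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
    {
            unsigned long result = 1UL << frac_bits;

            if (n) for (;;) {
                    if (n & 1) {
                            result *= x;
                            result += 1UL << (frac_bits - 1);
                            result >>= frac_bits;
                    }
                    n >>= 1;
                    if (!n)
                            break;
                    x *= x;
                    x += 1UL << (frac_bits - 1);
                    x >>= frac_bits;
            }
            return result;
    }

    static unsigned long
    calc_load_n(unsigned long load, unsigned long exp,
                unsigned long active, unsigned int n)
    {
            return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
    }

    int main(void)
    {
            unsigned long a_iter = 3 * FIXED_1, a_bulk = 3 * FIXED_1;
            unsigned long active = FIXED_1;         /* one task stays runnable */
            unsigned int n = 7, i;                  /* 7 missed LOAD_FREQ windows */

            for (i = 0; i < n; i++)
                    a_iter = calc_load(a_iter, EXP_1, active);
            a_bulk = calc_load_n(a_bulk, EXP_1, active, n);

            printf("iterated=%lu bulk=%lu (equal up to fixed-point rounding)\n",
                   a_iter, a_bulk);
            return 0;
    }
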
298
299/*
300 * NO_HZ can leave us missing all per-cpu ticks calling
301 * calc_load_account_active(), but since an idle CPU folds its delta into
302 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
303 * in the pending idle delta if our idle period crossed a load cycle boundary.
304 *
305 * Once we've updated the global active value, we need to apply the exponential
306 * weights adjusted to the number of cycles missed.
307 */
308static void calc_global_nohz(void)
309{
310 long delta, active, n;
311
312 if (!time_before(jiffies, calc_load_update + 10)) {
313 /*
314 * Catch-up, fold however many we are behind still
315 */
316 delta = jiffies - calc_load_update - 10;
317 n = 1 + (delta / LOAD_FREQ);
318
319 active = atomic_long_read(&calc_load_tasks);
320 active = active > 0 ? active * FIXED_1 : 0;
321
322 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
323 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
324 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
325
326 calc_load_update += n * LOAD_FREQ;
327 }
328
329 /*
330 * Flip the idle index...
331 *
332 * Make sure we first write the new time then flip the index, so that
333 * calc_load_write_idx() will see the new time when it reads the new
334 * index, this avoids a double flip messing things up.
335 */
336 smp_wmb();
337 calc_load_idx++;
338}
339#else /* !CONFIG_NO_HZ_COMMON */
340
341static inline long calc_load_fold_idle(void) { return 0; }
342static inline void calc_global_nohz(void) { }
343
344#endif /* CONFIG_NO_HZ_COMMON */
345
346/*
347 * calc_load - update the avenrun load estimates 10 ticks after the
348 * CPUs have updated calc_load_tasks.
349 */
350void calc_global_load(unsigned long ticks)
351{
352 long active, delta;
353
354 if (time_before(jiffies, calc_load_update + 10))
355 return;
356
357 /*
358 * Fold the 'old' idle-delta to include all NO_HZ cpus.
359 */
360 delta = calc_load_fold_idle();
361 if (delta)
362 atomic_long_add(delta, &calc_load_tasks);
363
364 active = atomic_long_read(&calc_load_tasks);
365 active = active > 0 ? active * FIXED_1 : 0;
366
367 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
368 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
369 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
370
371 calc_load_update += LOAD_FREQ;
372
373 /*
374 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
375 */
376 calc_global_nohz();
377}
378
379/*
380 * Called from update_cpu_load() to periodically update this CPU's
381 * active count.
382 */
383static void calc_load_account_active(struct rq *this_rq)
384{
385 long delta;
386
387 if (time_before(jiffies, this_rq->calc_load_update))
388 return;
389
390 delta = calc_load_fold_active(this_rq);
391 if (delta)
392 atomic_long_add(delta, &calc_load_tasks);
393
394 this_rq->calc_load_update += LOAD_FREQ;
395}
396
397/*
398 * End of global load-average stuff
399 */
400
401/*
402 * The exact cpuload at various idx values, calculated at every tick would be
403 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
404 *
405 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
406 * on nth tick when cpu may be busy, then we have:
407 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
408 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
409 *
410 * decay_load_missed() below does efficient calculation of
411 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
412 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
413 *
414 * The calculation is approximated on a 128 point scale.
415 * degrade_zero_ticks is the number of ticks after which load at any
416 * particular idx is approximated to be zero.
417 * degrade_factor is a precomputed table, a row for each load idx.
418 * Each column corresponds to degradation factor for a power of two ticks,
419 * based on 128 point scale.
420 * Example:
421 * row 2, col 3 (=12) says that the degradation at load idx 2 after
422 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
423 *
424 * With this power of 2 load factors, we can degrade the load n times
425 * by looking at 1 bits in n and doing as many mult/shift instead of
426 * n mult/shifts needed by the exact degradation.
427 */
428#define DEGRADE_SHIFT 7
429static const unsigned char
430 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
431static const unsigned char
432 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
433 {0, 0, 0, 0, 0, 0, 0, 0},
434 {64, 32, 8, 0, 0, 0, 0, 0},
435 {96, 72, 40, 12, 1, 0, 0},
436 {112, 98, 75, 43, 15, 1, 0},
437 {120, 112, 98, 76, 45, 16, 2} };
438
439/*
440 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
441 * would be when CPU is idle and so we just decay the old load without
442 * adding any new load.
443 */
444static unsigned long
445decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
446{
447 int j = 0;
448
449 if (!missed_updates)
450 return load;
451
452 if (missed_updates >= degrade_zero_ticks[idx])
453 return 0;
454
455 if (idx == 1)
456 return load >> missed_updates;
457
458 while (missed_updates) {
459 if (missed_updates % 2)
460 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
461
462 missed_updates >>= 1;
463 j++;
464 }
465 return load;
466}
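
To make the table-driven decay above concrete, a minimal userspace sketch (not part of the patch) compares it with the exact factor it approximates; the load value and missed-tick count are hypothetical, and pow() needs -lm:

    #include <stdio.h>
    #include <math.h>

    #define DEGRADE_SHIFT     7
    #define CPU_LOAD_IDX_MAX  5

    static const unsigned char degrade_zero_ticks[CPU_LOAD_IDX_MAX] =
            {0, 8, 32, 64, 128};
    static const unsigned char
    degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
            {0, 0, 0, 0, 0, 0, 0, 0},
            {64, 32, 8, 0, 0, 0, 0, 0},
            {96, 72, 40, 12, 1, 0, 0},
            {112, 98, 75, 43, 15, 1, 0},
            {120, 112, 98, 76, 45, 16, 2} };

    /* same bit-walk as decay_load_missed() above */
    static unsigned long
    decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
    {
            int j = 0;

            if (!missed_updates)
                    return load;
            if (missed_updates >= degrade_zero_ticks[idx])
                    return 0;
            if (idx == 1)
                    return load >> missed_updates;

            while (missed_updates) {
                    if (missed_updates % 2)
                            load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
                    missed_updates >>= 1;
                    j++;
            }
            return load;
    }

    int main(void)
    {
            unsigned long load = 1000;
            int idx = 2, missed = 8;

            /* exact factor is (3/4)^8; the table row approximates it as 12/128 */
            double exact = load * pow(3.0 / 4.0, missed);

            printf("approx=%lu exact=%.1f\n",
                   decay_load_missed(load, missed, idx), exact);
            return 0;
    }
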
467
468/*
469 * Update rq->cpu_load[] statistics. This function is usually called every
470 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
471 * every tick. We fix it up based on jiffies.
472 */
473static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
474 unsigned long pending_updates)
475{
476 int i, scale;
477
478 this_rq->nr_load_updates++;
479
480 /* Update our load: */
481 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
482 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
483 unsigned long old_load, new_load;
484
485 /* scale is effectively 1 << i now, and >> i divides by scale */
486
487 old_load = this_rq->cpu_load[i];
488 old_load = decay_load_missed(old_load, pending_updates - 1, i);
489 new_load = this_load;
490 /*
491 * Round up the averaging division if load is increasing. This
492 * prevents us from getting stuck on 9 if the load is 10, for
493 * example.
494 */
495 if (new_load > old_load)
496 new_load += scale - 1;
497
498 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
499 }
500
501 sched_avg_update(this_rq);
502}
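
A small userspace sketch of one cpu_load[] index (not part of the patch) shows why the "scale - 1" rounding matters: with a constant instantaneous load of 10 the average reaches 10 instead of sticking at 9. The index and load values are hypothetical:

    #include <stdio.h>

    /* one averaging step of __update_cpu_load() for a single index i */
    static unsigned long
    cpu_load_step(unsigned long old_load, unsigned long new_load, int i)
    {
            unsigned long scale = 1UL << i;

            /* round up when load is rising, as in the function above */
            if (new_load > old_load)
                    new_load += scale - 1;

            return (old_load * (scale - 1) + new_load) >> i;
    }

    int main(void)
    {
            unsigned long load = 0;
            int tick;

            /* constant instantaneous load of 10 at idx 2 (scale 4) */
            for (tick = 0; tick < 20; tick++)
                    load = cpu_load_step(load, 10, 2);

            /* without the "scale - 1" rounding this would converge to 9 */
            printf("cpu_load[2] after 20 ticks: %lu\n", load);
            return 0;
    }
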
503
504#ifdef CONFIG_SMP
505static inline unsigned long get_rq_runnable_load(struct rq *rq)
506{
507 return rq->cfs.runnable_load_avg;
508}
509#else
510static inline unsigned long get_rq_runnable_load(struct rq *rq)
511{
512 return rq->load.weight;
513}
514#endif
515
516#ifdef CONFIG_NO_HZ_COMMON
517/*
518 * There is no sane way to deal with nohz on smp when using jiffies because the
519 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
520 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
521 *
522 * Therefore we cannot use the delta approach from the regular tick since that
523 * would seriously skew the load calculation. However we'll make do for those
524 * updates happening while idle (nohz_idle_balance) or coming out of idle
525 * (tick_nohz_idle_exit).
526 *
527 * This means we might still be one tick off for nohz periods.
528 */
529
530/*
531 * Called from nohz_idle_balance() to update the load ratings before doing the
532 * idle balance.
533 */
534void update_idle_cpu_load(struct rq *this_rq)
535{
536 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
537 unsigned long load = get_rq_runnable_load(this_rq);
538 unsigned long pending_updates;
539
540 /*
541 * bail if there's load or we're actually up-to-date.
542 */
543 if (load || curr_jiffies == this_rq->last_load_update_tick)
544 return;
545
546 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
547 this_rq->last_load_update_tick = curr_jiffies;
548
549 __update_cpu_load(this_rq, load, pending_updates);
550}
551
552/*
553 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
554 */
555void update_cpu_load_nohz(void)
556{
557 struct rq *this_rq = this_rq();
558 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
559 unsigned long pending_updates;
560
561 if (curr_jiffies == this_rq->last_load_update_tick)
562 return;
563
564 raw_spin_lock(&this_rq->lock);
565 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
566 if (pending_updates) {
567 this_rq->last_load_update_tick = curr_jiffies;
568 /*
569 * We were idle, this means load 0, the current load might be
570 * !0 due to remote wakeups and the sort.
571 */
572 __update_cpu_load(this_rq, 0, pending_updates);
573 }
574 raw_spin_unlock(&this_rq->lock);
575}
576#endif /* CONFIG_NO_HZ */
577
578/*
579 * Called from scheduler_tick()
580 */
581void update_cpu_load_active(struct rq *this_rq)
582{
583 unsigned long load = get_rq_runnable_load(this_rq);
584 /*
585 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
586 */
587 this_rq->last_load_update_tick = jiffies;
588 __update_cpu_load(this_rq, load, 1);
589
590 calc_load_account_active(this_rq);
591}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 127a2c4cf4ab..01970c8e64df 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -399,20 +399,6 @@ static inline struct task_group *next_task_group(struct task_group *tg)
399 (iter = next_task_group(iter)) && \ 399 (iter = next_task_group(iter)) && \
400 (rt_rq = iter->rt_rq[cpu_of(rq)]);) 400 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
401 401
402static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
403{
404 list_add_rcu(&rt_rq->leaf_rt_rq_list,
405 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
406}
407
408static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
409{
410 list_del_rcu(&rt_rq->leaf_rt_rq_list);
411}
412
413#define for_each_leaf_rt_rq(rt_rq, rq) \
414 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
415
416#define for_each_sched_rt_entity(rt_se) \ 402#define for_each_sched_rt_entity(rt_se) \
417 for (; rt_se; rt_se = rt_se->parent) 403 for (; rt_se; rt_se = rt_se->parent)
418 404
@@ -472,7 +458,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
472#ifdef CONFIG_SMP 458#ifdef CONFIG_SMP
473static inline const struct cpumask *sched_rt_period_mask(void) 459static inline const struct cpumask *sched_rt_period_mask(void)
474{ 460{
475 return cpu_rq(smp_processor_id())->rd->span; 461 return this_rq()->rd->span;
476} 462}
477#else 463#else
478static inline const struct cpumask *sched_rt_period_mask(void) 464static inline const struct cpumask *sched_rt_period_mask(void)
@@ -509,17 +495,6 @@ typedef struct rt_rq *rt_rq_iter_t;
509#define for_each_rt_rq(rt_rq, iter, rq) \ 495#define for_each_rt_rq(rt_rq, iter, rq) \
510 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 496 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
511 497
512static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
513{
514}
515
516static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
517{
518}
519
520#define for_each_leaf_rt_rq(rt_rq, rq) \
521 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
522
523#define for_each_sched_rt_entity(rt_se) \ 498#define for_each_sched_rt_entity(rt_se) \
524 for (; rt_se; rt_se = NULL) 499 for (; rt_se; rt_se = NULL)
525 500
@@ -699,15 +674,6 @@ balanced:
699 } 674 }
700} 675}
701 676
702static void disable_runtime(struct rq *rq)
703{
704 unsigned long flags;
705
706 raw_spin_lock_irqsave(&rq->lock, flags);
707 __disable_runtime(rq);
708 raw_spin_unlock_irqrestore(&rq->lock, flags);
709}
710
711static void __enable_runtime(struct rq *rq) 677static void __enable_runtime(struct rq *rq)
712{ 678{
713 rt_rq_iter_t iter; 679 rt_rq_iter_t iter;
@@ -732,37 +698,6 @@ static void __enable_runtime(struct rq *rq)
732 } 698 }
733} 699}
734 700
735static void enable_runtime(struct rq *rq)
736{
737 unsigned long flags;
738
739 raw_spin_lock_irqsave(&rq->lock, flags);
740 __enable_runtime(rq);
741 raw_spin_unlock_irqrestore(&rq->lock, flags);
742}
743
744int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
745{
746 int cpu = (int)(long)hcpu;
747
748 switch (action) {
749 case CPU_DOWN_PREPARE:
750 case CPU_DOWN_PREPARE_FROZEN:
751 disable_runtime(cpu_rq(cpu));
752 return NOTIFY_OK;
753
754 case CPU_DOWN_FAILED:
755 case CPU_DOWN_FAILED_FROZEN:
756 case CPU_ONLINE:
757 case CPU_ONLINE_FROZEN:
758 enable_runtime(cpu_rq(cpu));
759 return NOTIFY_OK;
760
761 default:
762 return NOTIFY_DONE;
763 }
764}
765
766static int balance_runtime(struct rt_rq *rt_rq) 701static int balance_runtime(struct rt_rq *rt_rq)
767{ 702{
768 int more = 0; 703 int more = 0;
@@ -926,7 +861,7 @@ static void update_curr_rt(struct rq *rq)
926 if (curr->sched_class != &rt_sched_class) 861 if (curr->sched_class != &rt_sched_class)
927 return; 862 return;
928 863
929 delta_exec = rq->clock_task - curr->se.exec_start; 864 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
930 if (unlikely((s64)delta_exec <= 0)) 865 if (unlikely((s64)delta_exec <= 0))
931 return; 866 return;
932 867
@@ -936,7 +871,7 @@ static void update_curr_rt(struct rq *rq)
936 curr->se.sum_exec_runtime += delta_exec; 871 curr->se.sum_exec_runtime += delta_exec;
937 account_group_exec_runtime(curr, delta_exec); 872 account_group_exec_runtime(curr, delta_exec);
938 873
939 curr->se.exec_start = rq->clock_task; 874 curr->se.exec_start = rq_clock_task(rq);
940 cpuacct_charge(curr, delta_exec); 875 cpuacct_charge(curr, delta_exec);
941 876
942 sched_rt_avg_update(rq, delta_exec); 877 sched_rt_avg_update(rq, delta_exec);
@@ -1106,9 +1041,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1106 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 1041 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
1107 return; 1042 return;
1108 1043
1109 if (!rt_rq->rt_nr_running)
1110 list_add_leaf_rt_rq(rt_rq);
1111
1112 if (head) 1044 if (head)
1113 list_add(&rt_se->run_list, queue); 1045 list_add(&rt_se->run_list, queue);
1114 else 1046 else
@@ -1128,8 +1060,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
1128 __clear_bit(rt_se_prio(rt_se), array->bitmap); 1060 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1129 1061
1130 dec_rt_tasks(rt_se, rt_rq); 1062 dec_rt_tasks(rt_se, rt_rq);
1131 if (!rt_rq->rt_nr_running)
1132 list_del_leaf_rt_rq(rt_rq);
1133} 1063}
1134 1064
1135/* 1065/*
@@ -1385,7 +1315,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1385 } while (rt_rq); 1315 } while (rt_rq);
1386 1316
1387 p = rt_task_of(rt_se); 1317 p = rt_task_of(rt_se);
1388 p->se.exec_start = rq->clock_task; 1318 p->se.exec_start = rq_clock_task(rq);
1389 1319
1390 return p; 1320 return p;
1391} 1321}
@@ -1434,42 +1364,24 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1434 return 0; 1364 return 0;
1435} 1365}
1436 1366
1437/* Return the second highest RT task, NULL otherwise */ 1367/*
1438static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) 1368 * Return the highest pushable rq's task, which is suitable to be executed
1369 * on the cpu, NULL otherwise
1370 */
1371static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1439{ 1372{
1440 struct task_struct *next = NULL; 1373 struct plist_head *head = &rq->rt.pushable_tasks;
1441 struct sched_rt_entity *rt_se; 1374 struct task_struct *p;
1442 struct rt_prio_array *array;
1443 struct rt_rq *rt_rq;
1444 int idx;
1445
1446 for_each_leaf_rt_rq(rt_rq, rq) {
1447 array = &rt_rq->active;
1448 idx = sched_find_first_bit(array->bitmap);
1449next_idx:
1450 if (idx >= MAX_RT_PRIO)
1451 continue;
1452 if (next && next->prio <= idx)
1453 continue;
1454 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1455 struct task_struct *p;
1456 1375
1457 if (!rt_entity_is_task(rt_se)) 1376 if (!has_pushable_tasks(rq))
1458 continue; 1377 return NULL;
1459 1378
1460 p = rt_task_of(rt_se); 1379 plist_for_each_entry(p, head, pushable_tasks) {
1461 if (pick_rt_task(rq, p, cpu)) { 1380 if (pick_rt_task(rq, p, cpu))
1462 next = p; 1381 return p;
1463 break;
1464 }
1465 }
1466 if (!next) {
1467 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1468 goto next_idx;
1469 }
1470 } 1382 }
1471 1383
1472 return next; 1384 return NULL;
1473} 1385}
1474 1386
1475static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1387static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
@@ -1743,12 +1655,10 @@ static int pull_rt_task(struct rq *this_rq)
1743 double_lock_balance(this_rq, src_rq); 1655 double_lock_balance(this_rq, src_rq);
1744 1656
1745 /* 1657 /*
1746 * Are there still pullable RT tasks? 1658 * We can pull only a task, which is pushable
1659 * on its rq, and no others.
1747 */ 1660 */
1748 if (src_rq->rt.rt_nr_running <= 1) 1661 p = pick_highest_pushable_task(src_rq, this_cpu);
1749 goto skip;
1750
1751 p = pick_next_highest_task_rt(src_rq, this_cpu);
1752 1662
1753 /* 1663 /*
1754 * Do we have an RT task that preempts 1664 * Do we have an RT task that preempts
@@ -2037,7 +1947,7 @@ static void set_curr_task_rt(struct rq *rq)
2037{ 1947{
2038 struct task_struct *p = rq->curr; 1948 struct task_struct *p = rq->curr;
2039 1949
2040 p->se.exec_start = rq->clock_task; 1950 p->se.exec_start = rq_clock_task(rq);
2041 1951
2042 /* The running task is never eligible for pushing */ 1952 /* The running task is never eligible for pushing */
2043 dequeue_pushable_task(rq, p); 1953 dequeue_pushable_task(rq, p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ce39224d6155..ef0a7b2439dd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -10,8 +10,16 @@
10#include "cpupri.h" 10#include "cpupri.h"
11#include "cpuacct.h" 11#include "cpuacct.h"
12 12
13struct rq;
14
13extern __read_mostly int scheduler_running; 15extern __read_mostly int scheduler_running;
14 16
17extern unsigned long calc_load_update;
18extern atomic_long_t calc_load_tasks;
19
20extern long calc_load_fold_active(struct rq *this_rq);
21extern void update_cpu_load_active(struct rq *this_rq);
22
15/* 23/*
16 * Convert user-nice values [ -20 ... 0 ... 19 ] 24 * Convert user-nice values [ -20 ... 0 ... 19 ]
17 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 25 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -140,10 +148,11 @@ struct task_group {
140 struct cfs_rq **cfs_rq; 148 struct cfs_rq **cfs_rq;
141 unsigned long shares; 149 unsigned long shares;
142 150
143 atomic_t load_weight; 151#ifdef CONFIG_SMP
144 atomic64_t load_avg; 152 atomic_long_t load_avg;
145 atomic_t runnable_avg; 153 atomic_t runnable_avg;
146#endif 154#endif
155#endif
147 156
148#ifdef CONFIG_RT_GROUP_SCHED 157#ifdef CONFIG_RT_GROUP_SCHED
149 struct sched_rt_entity **rt_se; 158 struct sched_rt_entity **rt_se;
@@ -261,26 +270,21 @@ struct cfs_rq {
261#endif 270#endif
262 271
263#ifdef CONFIG_SMP 272#ifdef CONFIG_SMP
264/*
265 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
266 * removed when useful for applications beyond shares distribution (e.g.
267 * load-balance).
268 */
269#ifdef CONFIG_FAIR_GROUP_SCHED
270 /* 273 /*
271 * CFS Load tracking 274 * CFS Load tracking
272 * Under CFS, load is tracked on a per-entity basis and aggregated up. 275 * Under CFS, load is tracked on a per-entity basis and aggregated up.
273 * This allows for the description of both thread and group usage (in 276 * This allows for the description of both thread and group usage (in
274 * the FAIR_GROUP_SCHED case). 277 * the FAIR_GROUP_SCHED case).
275 */ 278 */
276 u64 runnable_load_avg, blocked_load_avg; 279 unsigned long runnable_load_avg, blocked_load_avg;
277 atomic64_t decay_counter, removed_load; 280 atomic64_t decay_counter;
278 u64 last_decay; 281 u64 last_decay;
279#endif /* CONFIG_FAIR_GROUP_SCHED */ 282 atomic_long_t removed_load;
280/* These always depend on CONFIG_FAIR_GROUP_SCHED */ 283
281#ifdef CONFIG_FAIR_GROUP_SCHED 284#ifdef CONFIG_FAIR_GROUP_SCHED
285 /* Required to track per-cpu representation of a task_group */
282 u32 tg_runnable_contrib; 286 u32 tg_runnable_contrib;
283 u64 tg_load_contrib; 287 unsigned long tg_load_contrib;
284#endif /* CONFIG_FAIR_GROUP_SCHED */ 288#endif /* CONFIG_FAIR_GROUP_SCHED */
285 289
286 /* 290 /*
@@ -353,7 +357,6 @@ struct rt_rq {
353 unsigned long rt_nr_boosted; 357 unsigned long rt_nr_boosted;
354 358
355 struct rq *rq; 359 struct rq *rq;
356 struct list_head leaf_rt_rq_list;
357 struct task_group *tg; 360 struct task_group *tg;
358#endif 361#endif
359}; 362};
@@ -540,6 +543,16 @@ DECLARE_PER_CPU(struct rq, runqueues);
540#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 543#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
541#define raw_rq() (&__raw_get_cpu_var(runqueues)) 544#define raw_rq() (&__raw_get_cpu_var(runqueues))
542 545
546static inline u64 rq_clock(struct rq *rq)
547{
548 return rq->clock;
549}
550
551static inline u64 rq_clock_task(struct rq *rq)
552{
553 return rq->clock_task;
554}
555
543#ifdef CONFIG_SMP 556#ifdef CONFIG_SMP
544 557
545#define rcu_dereference_check_sched_domain(p) \ 558#define rcu_dereference_check_sched_domain(p) \
@@ -884,24 +897,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
884#define WF_FORK 0x02 /* child wakeup after fork */ 897#define WF_FORK 0x02 /* child wakeup after fork */
885#define WF_MIGRATED 0x4 /* internal use, task got migrated */ 898#define WF_MIGRATED 0x4 /* internal use, task got migrated */
886 899
887static inline void update_load_add(struct load_weight *lw, unsigned long inc)
888{
889 lw->weight += inc;
890 lw->inv_weight = 0;
891}
892
893static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
894{
895 lw->weight -= dec;
896 lw->inv_weight = 0;
897}
898
899static inline void update_load_set(struct load_weight *lw, unsigned long w)
900{
901 lw->weight = w;
902 lw->inv_weight = 0;
903}
904
905/* 900/*
906 * To aid in avoiding the subversion of "niceness" due to uneven distribution 901 * To aid in avoiding the subversion of "niceness" due to uneven distribution
907 * of tasks with abnormal "nice" values across CPUs the contribution that 902 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1028,17 +1023,8 @@ extern void update_group_power(struct sched_domain *sd, int cpu);
1028extern void trigger_load_balance(struct rq *rq, int cpu); 1023extern void trigger_load_balance(struct rq *rq, int cpu);
1029extern void idle_balance(int this_cpu, struct rq *this_rq); 1024extern void idle_balance(int this_cpu, struct rq *this_rq);
1030 1025
1031/*
1032 * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
1033 * becomes useful in lb
1034 */
1035#if defined(CONFIG_FAIR_GROUP_SCHED)
1036extern void idle_enter_fair(struct rq *this_rq); 1026extern void idle_enter_fair(struct rq *this_rq);
1037extern void idle_exit_fair(struct rq *this_rq); 1027extern void idle_exit_fair(struct rq *this_rq);
1038#else
1039static inline void idle_enter_fair(struct rq *this_rq) {}
1040static inline void idle_exit_fair(struct rq *this_rq) {}
1041#endif
1042 1028
1043#else /* CONFIG_SMP */ 1029#else /* CONFIG_SMP */
1044 1030
@@ -1051,7 +1037,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
1051extern void sysrq_sched_debug_show(void); 1037extern void sysrq_sched_debug_show(void);
1052extern void sched_init_granularity(void); 1038extern void sched_init_granularity(void);
1053extern void update_max_interval(void); 1039extern void update_max_interval(void);
1054extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
1055extern void init_sched_rt_class(void); 1040extern void init_sched_rt_class(void);
1056extern void init_sched_fair_class(void); 1041extern void init_sched_fair_class(void);
1057 1042
@@ -1063,6 +1048,8 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
1063 1048
1064extern void update_idle_cpu_load(struct rq *this_rq); 1049extern void update_idle_cpu_load(struct rq *this_rq);
1065 1050
1051extern void init_task_runnable_average(struct task_struct *p);
1052
1066#ifdef CONFIG_PARAVIRT 1053#ifdef CONFIG_PARAVIRT
1067static inline u64 steal_ticks(u64 steal) 1054static inline u64 steal_ticks(u64 steal)
1068{ 1055{
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 2ef90a51ec5e..5aef494fc8b4 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -61,7 +61,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
61 */ 61 */
62static inline void sched_info_dequeued(struct task_struct *t) 62static inline void sched_info_dequeued(struct task_struct *t)
63{ 63{
64 unsigned long long now = task_rq(t)->clock, delta = 0; 64 unsigned long long now = rq_clock(task_rq(t)), delta = 0;
65 65
66 if (unlikely(sched_info_on())) 66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued) 67 if (t->sched_info.last_queued)
@@ -79,7 +79,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
79 */ 79 */
80static void sched_info_arrive(struct task_struct *t) 80static void sched_info_arrive(struct task_struct *t)
81{ 81{
82 unsigned long long now = task_rq(t)->clock, delta = 0; 82 unsigned long long now = rq_clock(task_rq(t)), delta = 0;
83 83
84 if (t->sched_info.last_queued) 84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued; 85 delta = now - t->sched_info.last_queued;
@@ -100,7 +100,7 @@ static inline void sched_info_queued(struct task_struct *t)
100{ 100{
101 if (unlikely(sched_info_on())) 101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued) 102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = task_rq(t)->clock; 103 t->sched_info.last_queued = rq_clock(task_rq(t));
104} 104}
105 105
106/* 106/*
@@ -112,7 +112,7 @@ static inline void sched_info_queued(struct task_struct *t)
112 */ 112 */
113static inline void sched_info_depart(struct task_struct *t) 113static inline void sched_info_depart(struct task_struct *t)
114{ 114{
115 unsigned long long delta = task_rq(t)->clock - 115 unsigned long long delta = rq_clock(task_rq(t)) -
116 t->sched_info.last_arrival; 116 t->sched_info.last_arrival;
117 117
118 rq_sched_info_depart(task_rq(t), delta); 118 rq_sched_info_depart(task_rq(t), delta);
@@ -162,6 +162,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
162 */ 162 */
163 163
164/** 164/**
165 * cputimer_running - return true if cputimer is running
166 *
167 * @tsk: Pointer to target task.
168 */
169static inline bool cputimer_running(struct task_struct *tsk)
170
171{
172 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
173
174 if (!cputimer->running)
175 return false;
176
177 /*
178 * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
179 * in __exit_signal(), we won't account to the signal struct further
180 * cputime consumed by that task, even though the task can still be
181 * ticking after __exit_signal().
182 *
183 * In order to keep a consistent behaviour between thread group cputime
184 * and thread group cputimer accounting, lets also ignore the cputime
185 * elapsing after __exit_signal() in any thread group timer running.
186 *
187 * This makes sure that POSIX CPU clocks and timers are synchronized, so
188 * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
189 * clock delta is behind the expiring timer value.
190 */
191 if (unlikely(!tsk->sighand))
192 return false;
193
194 return true;
195}
196
197/**
165 * account_group_user_time - Maintain utime for a thread group. 198 * account_group_user_time - Maintain utime for a thread group.
166 * 199 *
167 * @tsk: Pointer to task structure. 200 * @tsk: Pointer to task structure.
@@ -176,7 +209,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
176{ 209{
177 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 210 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
178 211
179 if (!cputimer->running) 212 if (!cputimer_running(tsk))
180 return; 213 return;
181 214
182 raw_spin_lock(&cputimer->lock); 215 raw_spin_lock(&cputimer->lock);
@@ -199,7 +232,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
199{ 232{
200 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 233 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
201 234
202 if (!cputimer->running) 235 if (!cputimer_running(tsk))
203 return; 236 return;
204 237
205 raw_spin_lock(&cputimer->lock); 238 raw_spin_lock(&cputimer->lock);
@@ -222,7 +255,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
222{ 255{
223 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 256 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
224 257
225 if (!cputimer->running) 258 if (!cputimer_running(tsk))
226 return; 259 return;
227 260
228 raw_spin_lock(&cputimer->lock); 261 raw_spin_lock(&cputimer->lock);
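The stats.h hunks do two things: reads of rq->clock and rq->clock_task are routed through the rq_clock()/rq_clock_task() accessors introduced elsewhere in this merge, and the repeated "if (!cputimer->running) return;" test in the three account_group_*() helpers is folded into cputimer_running(), which additionally bails out once the task's sighand has been torn down by __exit_signal(). Below is a minimal user-space sketch of that guard-helper refactor; the struct and function names are hypothetical stand-ins, not kernel APIs.

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for signal_struct / thread_group_cputimer. */
    struct group_cputimer {
        bool running;
        unsigned long long utime, stime, sum_exec;
    };

    struct task {
        struct group_cputimer *cputimer;  /* &tsk->signal->cputimer in the kernel */
        void *sighand;                    /* NULL once __exit_signal() has run */
    };

    /* One guard instead of three copies of the same test. */
    static bool cputimer_running(const struct task *t)
    {
        if (!t->cputimer->running)
            return false;
        /* Ignore ticks that arrive after the task left its thread group. */
        if (!t->sighand)
            return false;
        return true;
    }

    static void account_group_user_time(struct task *t, unsigned long long delta)
    {
        if (!cputimer_running(t))
            return;
        t->cputimer->utime += delta;      /* the kernel takes cputimer->lock here */
    }

    int main(void)
    {
        struct group_cputimer gc = { .running = true };
        struct task t = { .cputimer = &gc, .sighand = &gc };

        account_group_user_time(&t, 5);
        t.sighand = NULL;                 /* simulate __exit_signal() */
        account_group_user_time(&t, 7);   /* ignored by the guard */
        printf("accounted utime: %llu\n", gc.utime);   /* prints 5 */
        return 0;
    }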
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index da5eb5bed84a..e08fbeeb54b9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
28 struct task_struct *stop = rq->stop; 28 struct task_struct *stop = rq->stop;
29 29
30 if (stop && stop->on_rq) { 30 if (stop && stop->on_rq) {
31 stop->se.exec_start = rq->clock_task; 31 stop->se.exec_start = rq_clock_task(rq);
32 return stop; 32 return stop;
33 } 33 }
34 34
@@ -57,7 +57,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
57 struct task_struct *curr = rq->curr; 57 struct task_struct *curr = rq->curr;
58 u64 delta_exec; 58 u64 delta_exec;
59 59
60 delta_exec = rq->clock_task - curr->se.exec_start; 60 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
61 if (unlikely((s64)delta_exec < 0)) 61 if (unlikely((s64)delta_exec < 0))
62 delta_exec = 0; 62 delta_exec = 0;
63 63
@@ -67,7 +67,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
67 curr->se.sum_exec_runtime += delta_exec; 67 curr->se.sum_exec_runtime += delta_exec;
68 account_group_exec_runtime(curr, delta_exec); 68 account_group_exec_runtime(curr, delta_exec);
69 69
70 curr->se.exec_start = rq->clock_task; 70 curr->se.exec_start = rq_clock_task(rq);
71 cpuacct_charge(curr, delta_exec); 71 cpuacct_charge(curr, delta_exec);
72} 72}
73 73
@@ -79,7 +79,7 @@ static void set_curr_task_stop(struct rq *rq)
79{ 79{
80 struct task_struct *stop = rq->stop; 80 struct task_struct *stop = rq->stop;
81 81
82 stop->se.exec_start = rq->clock_task; 82 stop->se.exec_start = rq_clock_task(rq);
83} 83}
84 84
85static void switched_to_stop(struct rq *rq, struct task_struct *p) 85static void switched_to_stop(struct rq *rq, struct task_struct *p)
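The stop_task class changes are the same clock-accessor conversion: exec_start is stamped from rq_clock_task() when the stop task is picked, and put_prev_task_stop() charges the elapsed window against it, clamping a negative delta to zero before opening the next window. A user-space sketch of that snapshot/delta/clamp pattern, with CLOCK_MONOTONIC standing in for the runqueue clock:

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>

    /* Monotonic nanosecond timestamp; plays the role of rq_clock_task(). */
    static uint64_t now_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
    }

    int main(void)
    {
        uint64_t exec_start = now_ns();   /* set when the task is picked */
        uint64_t sum_exec = 0;

        for (volatile int i = 0; i < 1000000; i++)
            ;                             /* ... the task runs ... */

        /* put_prev_task_stop()-style bookkeeping: charge the elapsed window. */
        int64_t delta = (int64_t)(now_ns() - exec_start);
        if (delta < 0)                    /* clock glitches are clamped, not charged */
            delta = 0;
        sum_exec += delta;
        exec_start = now_ns();            /* open the next accounting window */

        printf("charged %lld ns (total %llu)\n",
               (long long)delta, (unsigned long long)sum_exec);
        return 0;
    }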
diff --git a/kernel/signal.c b/kernel/signal.c
index 113411bfe8b1..50e41075ac77 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2848,7 +2848,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2848 recalc_sigpending(); 2848 recalc_sigpending();
2849 spin_unlock_irq(&tsk->sighand->siglock); 2849 spin_unlock_irq(&tsk->sighand->siglock);
2850 2850
2851 timeout = schedule_timeout_interruptible(timeout); 2851 timeout = freezable_schedule_timeout_interruptible(timeout);
2852 2852
2853 spin_lock_irq(&tsk->sighand->siglock); 2853 spin_lock_irq(&tsk->sighand->siglock);
2854 __set_task_blocked(tsk, &tsk->real_blocked); 2854 __set_task_blocked(tsk, &tsk->real_blocked);
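do_sigtimedwait() now sleeps via freezable_schedule_timeout_interruptible(), so a task parked in sigtimedwait() is treated as freezable during suspend instead of holding up the freezer. The sketch below only illustrates the wrapper idea of bracketing a sleep with "do not count me" / "count me again" hints; the counter and helper names are hypothetical and do not reflect the kernel's freezer internals.

    #include <stdio.h>
    #include <time.h>

    /* Hypothetical freezer bookkeeping: tasks the freezer must wait for. */
    static int tasks_blocking_suspend = 1;     /* this task, by default */

    static void freezer_do_not_count(void) { tasks_blocking_suspend--; }
    static void freezer_count(void)        { tasks_blocking_suspend++; }

    /* Sketch of the freezable_*() wrapper idea: the same sleep, bracketed by
     * hints telling the freezer it need not wait for this task. */
    static void freezable_sleep(unsigned int secs)
    {
        struct timespec ts = { .tv_sec = secs, .tv_nsec = 0 };

        freezer_do_not_count();
        printf("while sleeping, tasks blocking suspend: %d\n",
               tasks_blocking_suspend);
        nanosleep(&ts, NULL);   /* stands in for schedule_timeout_interruptible() */
        freezer_count();
    }

    int main(void)
    {
        freezable_sleep(1);
        printf("after waking,  tasks blocking suspend: %d\n",
               tasks_blocking_suspend);
        return 0;
    }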
diff --git a/kernel/smp.c b/kernel/smp.c
index 4dba0f7b72ad..fe9f773d7114 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -73,7 +73,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
73 return NOTIFY_OK; 73 return NOTIFY_OK;
74} 74}
75 75
76static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { 76static struct notifier_block hotplug_cfd_notifier = {
77 .notifier_call = hotplug_cfd, 77 .notifier_call = hotplug_cfd,
78}; 78};
79 79
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 02fc5c933673..eb89e1807408 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -24,7 +24,7 @@
24 */ 24 */
25static DEFINE_PER_CPU(struct task_struct *, idle_threads); 25static DEFINE_PER_CPU(struct task_struct *, idle_threads);
26 26
27struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) 27struct task_struct *idle_thread_get(unsigned int cpu)
28{ 28{
29 struct task_struct *tsk = per_cpu(idle_threads, cpu); 29 struct task_struct *tsk = per_cpu(idle_threads, cpu);
30 30
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3d6833f125d3..be3d3514c325 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -127,8 +127,7 @@ static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
127 127
128void local_bh_disable(void) 128void local_bh_disable(void)
129{ 129{
130 __local_bh_disable((unsigned long)__builtin_return_address(0), 130 __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET);
131 SOFTIRQ_DISABLE_OFFSET);
132} 131}
133 132
134EXPORT_SYMBOL(local_bh_disable); 133EXPORT_SYMBOL(local_bh_disable);
@@ -139,7 +138,7 @@ static void __local_bh_enable(unsigned int cnt)
139 WARN_ON_ONCE(!irqs_disabled()); 138 WARN_ON_ONCE(!irqs_disabled());
140 139
141 if (softirq_count() == cnt) 140 if (softirq_count() == cnt)
142 trace_softirqs_on((unsigned long)__builtin_return_address(0)); 141 trace_softirqs_on(_RET_IP_);
143 sub_preempt_count(cnt); 142 sub_preempt_count(cnt);
144} 143}
145 144
@@ -184,7 +183,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
184 183
185void local_bh_enable(void) 184void local_bh_enable(void)
186{ 185{
187 _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 186 _local_bh_enable_ip(_RET_IP_);
188} 187}
189EXPORT_SYMBOL(local_bh_enable); 188EXPORT_SYMBOL(local_bh_enable);
190 189
@@ -229,8 +228,7 @@ asmlinkage void __do_softirq(void)
229 pending = local_softirq_pending(); 228 pending = local_softirq_pending();
230 account_irq_enter_time(current); 229 account_irq_enter_time(current);
231 230
232 __local_bh_disable((unsigned long)__builtin_return_address(0), 231 __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET);
233 SOFTIRQ_OFFSET);
234 lockdep_softirq_enter(); 232 lockdep_softirq_enter();
235 233
236 cpu = smp_processor_id(); 234 cpu = smp_processor_id();
@@ -701,7 +699,7 @@ void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
701} 699}
702EXPORT_SYMBOL(send_remote_softirq); 700EXPORT_SYMBOL(send_remote_softirq);
703 701
704static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, 702static int remote_softirq_cpu_notify(struct notifier_block *self,
705 unsigned long action, void *hcpu) 703 unsigned long action, void *hcpu)
706{ 704{
707 /* 705 /*
@@ -730,7 +728,7 @@ static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
730 return NOTIFY_OK; 728 return NOTIFY_OK;
731} 729}
732 730
733static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { 731static struct notifier_block remote_softirq_cpu_notifier = {
734 .notifier_call = remote_softirq_cpu_notify, 732 .notifier_call = remote_softirq_cpu_notify,
735}; 733};
736 734
@@ -832,7 +830,7 @@ static void takeover_tasklets(unsigned int cpu)
832} 830}
833#endif /* CONFIG_HOTPLUG_CPU */ 831#endif /* CONFIG_HOTPLUG_CPU */
834 832
835static int __cpuinit cpu_callback(struct notifier_block *nfb, 833static int cpu_callback(struct notifier_block *nfb,
836 unsigned long action, 834 unsigned long action,
837 void *hcpu) 835 void *hcpu)
838{ 836{
@@ -847,7 +845,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
847 return NOTIFY_OK; 845 return NOTIFY_OK;
848} 846}
849 847
850static struct notifier_block __cpuinitdata cpu_nfb = { 848static struct notifier_block cpu_nfb = {
851 .notifier_call = cpu_callback 849 .notifier_call = cpu_callback
852}; 850};
853 851
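Besides dropping the __cpuinit/__cpuinitdata annotations, the softirq.c hunks replace the open-coded (unsigned long)__builtin_return_address(0) with _RET_IP_, which is shorthand for exactly that expression. A standalone GCC/Clang example of the same idiom (RET_IP here is a local macro, not the kernel's):

    #include <stdio.h>

    /* Local equivalent of the kernel's _RET_IP_ shorthand. */
    #define RET_IP ((unsigned long)__builtin_return_address(0))

    /* noinline so the call site survives optimization and the address is real. */
    static __attribute__((noinline)) void trace_caller(const char *what)
    {
        printf("%s called from %#lx\n", what, RET_IP);
    }

    static void local_bh_disable_demo(void)
    {
        trace_caller("local_bh_disable_demo");
    }

    int main(void)
    {
        local_bh_disable_demo();
        trace_caller("main");
        return 0;
    }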
diff --git a/kernel/sys.c b/kernel/sys.c
index 2bbd9a73b54c..771129b299f8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -116,20 +116,6 @@ EXPORT_SYMBOL(fs_overflowuid);
116EXPORT_SYMBOL(fs_overflowgid); 116EXPORT_SYMBOL(fs_overflowgid);
117 117
118/* 118/*
119 * this indicates whether you can reboot with ctrl-alt-del: the default is yes
120 */
121
122int C_A_D = 1;
123struct pid *cad_pid;
124EXPORT_SYMBOL(cad_pid);
125
126/*
127 * If set, this is used for preparing the system to power off.
128 */
129
130void (*pm_power_off_prepare)(void);
131
132/*
133 * Returns true if current's euid is same as p's uid or euid, 119 * Returns true if current's euid is same as p's uid or euid,
134 * or has CAP_SYS_NICE to p's user_ns. 120 * or has CAP_SYS_NICE to p's user_ns.
135 * 121 *
@@ -308,266 +294,6 @@ out_unlock:
308 return retval; 294 return retval;
309} 295}
310 296
311/**
312 * emergency_restart - reboot the system
313 *
314 * Without shutting down any hardware or taking any locks
315 * reboot the system. This is called when we know we are in
316 * trouble so this is our best effort to reboot. This is
317 * safe to call in interrupt context.
318 */
319void emergency_restart(void)
320{
321 kmsg_dump(KMSG_DUMP_EMERG);
322 machine_emergency_restart();
323}
324EXPORT_SYMBOL_GPL(emergency_restart);
325
326void kernel_restart_prepare(char *cmd)
327{
328 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
329 system_state = SYSTEM_RESTART;
330 usermodehelper_disable();
331 device_shutdown();
332}
333
334/**
335 * register_reboot_notifier - Register function to be called at reboot time
336 * @nb: Info about notifier function to be called
337 *
338 * Registers a function with the list of functions
339 * to be called at reboot time.
340 *
341 * Currently always returns zero, as blocking_notifier_chain_register()
342 * always returns zero.
343 */
344int register_reboot_notifier(struct notifier_block *nb)
345{
346 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
347}
348EXPORT_SYMBOL(register_reboot_notifier);
349
350/**
351 * unregister_reboot_notifier - Unregister previously registered reboot notifier
352 * @nb: Hook to be unregistered
353 *
354 * Unregisters a previously registered reboot
355 * notifier function.
356 *
357 * Returns zero on success, or %-ENOENT on failure.
358 */
359int unregister_reboot_notifier(struct notifier_block *nb)
360{
361 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
362}
363EXPORT_SYMBOL(unregister_reboot_notifier);
364
365/* Add backwards compatibility for stable trees. */
366#ifndef PF_NO_SETAFFINITY
367#define PF_NO_SETAFFINITY PF_THREAD_BOUND
368#endif
369
370static void migrate_to_reboot_cpu(void)
371{
372 /* The boot cpu is always logical cpu 0 */
373 int cpu = 0;
374
375 cpu_hotplug_disable();
376
377 /* Make certain the cpu I'm about to reboot on is online */
378 if (!cpu_online(cpu))
379 cpu = cpumask_first(cpu_online_mask);
380
381 /* Prevent races with other tasks migrating this task */
382 current->flags |= PF_NO_SETAFFINITY;
383
384 /* Make certain I only run on the appropriate processor */
385 set_cpus_allowed_ptr(current, cpumask_of(cpu));
386}
387
388/**
389 * kernel_restart - reboot the system
390 * @cmd: pointer to buffer containing command to execute for restart
391 * or %NULL
392 *
393 * Shutdown everything and perform a clean reboot.
394 * This is not safe to call in interrupt context.
395 */
396void kernel_restart(char *cmd)
397{
398 kernel_restart_prepare(cmd);
399 migrate_to_reboot_cpu();
400 syscore_shutdown();
401 if (!cmd)
402 printk(KERN_EMERG "Restarting system.\n");
403 else
404 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
405 kmsg_dump(KMSG_DUMP_RESTART);
406 machine_restart(cmd);
407}
408EXPORT_SYMBOL_GPL(kernel_restart);
409
410static void kernel_shutdown_prepare(enum system_states state)
411{
412 blocking_notifier_call_chain(&reboot_notifier_list,
413 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
414 system_state = state;
415 usermodehelper_disable();
416 device_shutdown();
417}
418/**
419 * kernel_halt - halt the system
420 *
421 * Shutdown everything and perform a clean system halt.
422 */
423void kernel_halt(void)
424{
425 kernel_shutdown_prepare(SYSTEM_HALT);
426 migrate_to_reboot_cpu();
427 syscore_shutdown();
428 printk(KERN_EMERG "System halted.\n");
429 kmsg_dump(KMSG_DUMP_HALT);
430 machine_halt();
431}
432
433EXPORT_SYMBOL_GPL(kernel_halt);
434
435/**
436 * kernel_power_off - power_off the system
437 *
438 * Shutdown everything and perform a clean system power_off.
439 */
440void kernel_power_off(void)
441{
442 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
443 if (pm_power_off_prepare)
444 pm_power_off_prepare();
445 migrate_to_reboot_cpu();
446 syscore_shutdown();
447 printk(KERN_EMERG "Power down.\n");
448 kmsg_dump(KMSG_DUMP_POWEROFF);
449 machine_power_off();
450}
451EXPORT_SYMBOL_GPL(kernel_power_off);
452
453static DEFINE_MUTEX(reboot_mutex);
454
455/*
456 * Reboot system call: for obvious reasons only root may call it,
457 * and even root needs to set up some magic numbers in the registers
458 * so that some mistake won't make this reboot the whole machine.
459 * You can also set the meaning of the ctrl-alt-del-key here.
460 *
461 * reboot doesn't sync: do that yourself before calling this.
462 */
463SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
464 void __user *, arg)
465{
466 struct pid_namespace *pid_ns = task_active_pid_ns(current);
467 char buffer[256];
468 int ret = 0;
469
470 /* We only trust the superuser with rebooting the system. */
471 if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
472 return -EPERM;
473
474 /* For safety, we require "magic" arguments. */
475 if (magic1 != LINUX_REBOOT_MAGIC1 ||
476 (magic2 != LINUX_REBOOT_MAGIC2 &&
477 magic2 != LINUX_REBOOT_MAGIC2A &&
478 magic2 != LINUX_REBOOT_MAGIC2B &&
479 magic2 != LINUX_REBOOT_MAGIC2C))
480 return -EINVAL;
481
482 /*
483 * If pid namespaces are enabled and the current task is in a child
484 * pid_namespace, the command is handled by reboot_pid_ns() which will
485 * call do_exit().
486 */
487 ret = reboot_pid_ns(pid_ns, cmd);
488 if (ret)
489 return ret;
490
491 /* Instead of trying to make the power_off code look like
492 * halt when pm_power_off is not set do it the easy way.
493 */
494 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
495 cmd = LINUX_REBOOT_CMD_HALT;
496
497 mutex_lock(&reboot_mutex);
498 switch (cmd) {
499 case LINUX_REBOOT_CMD_RESTART:
500 kernel_restart(NULL);
501 break;
502
503 case LINUX_REBOOT_CMD_CAD_ON:
504 C_A_D = 1;
505 break;
506
507 case LINUX_REBOOT_CMD_CAD_OFF:
508 C_A_D = 0;
509 break;
510
511 case LINUX_REBOOT_CMD_HALT:
512 kernel_halt();
513 do_exit(0);
514 panic("cannot halt");
515
516 case LINUX_REBOOT_CMD_POWER_OFF:
517 kernel_power_off();
518 do_exit(0);
519 break;
520
521 case LINUX_REBOOT_CMD_RESTART2:
522 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
523 ret = -EFAULT;
524 break;
525 }
526 buffer[sizeof(buffer) - 1] = '\0';
527
528 kernel_restart(buffer);
529 break;
530
531#ifdef CONFIG_KEXEC
532 case LINUX_REBOOT_CMD_KEXEC:
533 ret = kernel_kexec();
534 break;
535#endif
536
537#ifdef CONFIG_HIBERNATION
538 case LINUX_REBOOT_CMD_SW_SUSPEND:
539 ret = hibernate();
540 break;
541#endif
542
543 default:
544 ret = -EINVAL;
545 break;
546 }
547 mutex_unlock(&reboot_mutex);
548 return ret;
549}
550
551static void deferred_cad(struct work_struct *dummy)
552{
553 kernel_restart(NULL);
554}
555
556/*
557 * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
558 * As it's called within an interrupt, it may NOT sync: the only choice
559 * is whether to reboot at once, or just ignore the ctrl-alt-del.
560 */
561void ctrl_alt_del(void)
562{
563 static DECLARE_WORK(cad_work, deferred_cad);
564
565 if (C_A_D)
566 schedule_work(&cad_work);
567 else
568 kill_cad_pid(SIGINT, 1);
569}
570
571/* 297/*
572 * Unprivileged users may change the real gid to the effective gid 298 * Unprivileged users may change the real gid to the effective gid
573 * or vice versa. (BSD-style) 299 * or vice versa. (BSD-style)
@@ -1309,6 +1035,17 @@ out:
1309 return retval; 1035 return retval;
1310} 1036}
1311 1037
1038static void set_special_pids(struct pid *pid)
1039{
1040 struct task_struct *curr = current->group_leader;
1041
1042 if (task_session(curr) != pid)
1043 change_pid(curr, PIDTYPE_SID, pid);
1044
1045 if (task_pgrp(curr) != pid)
1046 change_pid(curr, PIDTYPE_PGID, pid);
1047}
1048
1312SYSCALL_DEFINE0(setsid) 1049SYSCALL_DEFINE0(setsid)
1313{ 1050{
1314 struct task_struct *group_leader = current->group_leader; 1051 struct task_struct *group_leader = current->group_leader;
@@ -1328,7 +1065,7 @@ SYSCALL_DEFINE0(setsid)
1328 goto out; 1065 goto out;
1329 1066
1330 group_leader->signal->leader = 1; 1067 group_leader->signal->leader = 1;
1331 __set_special_pids(sid); 1068 set_special_pids(sid);
1332 1069
1333 proc_clear_tty(group_leader); 1070 proc_clear_tty(group_leader);
1334 1071
@@ -2281,68 +2018,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2281 return err ? -EFAULT : 0; 2018 return err ? -EFAULT : 0;
2282} 2019}
2283 2020
2284char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2285
2286static int __orderly_poweroff(bool force)
2287{
2288 char **argv;
2289 static char *envp[] = {
2290 "HOME=/",
2291 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2292 NULL
2293 };
2294 int ret;
2295
2296 argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
2297 if (argv) {
2298 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
2299 argv_free(argv);
2300 } else {
2301 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2302 __func__, poweroff_cmd);
2303 ret = -ENOMEM;
2304 }
2305
2306 if (ret && force) {
2307 printk(KERN_WARNING "Failed to start orderly shutdown: "
2308 "forcing the issue\n");
2309 /*
2310 * I guess this should try to kick off some daemon to sync and
2311 * poweroff asap. Or not even bother syncing if we're doing an
2312 * emergency shutdown?
2313 */
2314 emergency_sync();
2315 kernel_power_off();
2316 }
2317
2318 return ret;
2319}
2320
2321static bool poweroff_force;
2322
2323static void poweroff_work_func(struct work_struct *work)
2324{
2325 __orderly_poweroff(poweroff_force);
2326}
2327
2328static DECLARE_WORK(poweroff_work, poweroff_work_func);
2329
2330/**
2331 * orderly_poweroff - Trigger an orderly system poweroff
2332 * @force: force poweroff if command execution fails
2333 *
2334 * This may be called from any context to trigger a system shutdown.
2335 * If the orderly shutdown fails, it will force an immediate shutdown.
2336 */
2337int orderly_poweroff(bool force)
2338{
2339 if (force) /* do not override the pending "true" */
2340 poweroff_force = true;
2341 schedule_work(&poweroff_work);
2342 return 0;
2343}
2344EXPORT_SYMBOL_GPL(orderly_poweroff);
2345
2346/** 2021/**
2347 * do_sysinfo - fill in sysinfo struct 2022 * do_sysinfo - fill in sysinfo struct
2348 * @info: pointer to buffer to fill 2023 * @info: pointer to buffer to fill
@@ -2355,8 +2030,7 @@ static int do_sysinfo(struct sysinfo *info)
2355 2030
2356 memset(info, 0, sizeof(struct sysinfo)); 2031 memset(info, 0, sizeof(struct sysinfo));
2357 2032
2358 ktime_get_ts(&tp); 2033 get_monotonic_boottime(&tp);
2359 monotonic_to_bootbased(&tp);
2360 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); 2034 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
2361 2035
2362 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); 2036 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
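The bulk of the sys.c diff removes the reboot, ctrl-alt-del and orderly_poweroff machinery, which this merge moves into the new kernel/reboot.c; set_special_pids() moves in as a static helper for setsid(); and do_sysinfo() now reads the boot-based clock via get_monotonic_boottime() instead of ktime_get_ts() plus monotonic_to_bootbased(), rounding any leftover nanoseconds up to the next full second. A small Linux user-space analogue of that uptime computation, using CLOCK_BOOTTIME as the boot-based clock:

    #include <stdio.h>
    #include <time.h>

    #ifndef CLOCK_BOOTTIME                 /* fallback for very old headers */
    #define CLOCK_BOOTTIME CLOCK_MONOTONIC
    #endif

    int main(void)
    {
        struct timespec tp;

        if (clock_gettime(CLOCK_BOOTTIME, &tp) != 0) {
            perror("clock_gettime");
            return 1;
        }

        /* Same rounding as the hunk above: leftover nanoseconds bump the
         * reported uptime to the next full second. */
        long long uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
        printf("uptime: %lld s (raw %lld.%09ld)\n",
               uptime, (long long)tp.tv_sec, tp.tv_nsec);
        return 0;
    }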
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9edcf456e0fc..07f6fc468e17 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -120,7 +120,6 @@ extern int blk_iopoll_enabled;
120/* Constants used for minimum and maximum */ 120/* Constants used for minimum and maximum */
121#ifdef CONFIG_LOCKUP_DETECTOR 121#ifdef CONFIG_LOCKUP_DETECTOR
122static int sixty = 60; 122static int sixty = 60;
123static int neg_one = -1;
124#endif 123#endif
125 124
126static int zero; 125static int zero;
@@ -600,6 +599,13 @@ static struct ctl_table kern_table[] = {
600 .mode = 0644, 599 .mode = 0644,
601 .proc_handler = proc_dointvec, 600 .proc_handler = proc_dointvec,
602 }, 601 },
602 {
603 .procname = "traceoff_on_warning",
604 .data = &__disable_trace_on_warning,
605 .maxlen = sizeof(__disable_trace_on_warning),
606 .mode = 0644,
607 .proc_handler = proc_dointvec,
608 },
603#endif 609#endif
604#ifdef CONFIG_MODULES 610#ifdef CONFIG_MODULES
605 { 611 {
@@ -801,7 +807,7 @@ static struct ctl_table kern_table[] = {
801#if defined(CONFIG_LOCKUP_DETECTOR) 807#if defined(CONFIG_LOCKUP_DETECTOR)
802 { 808 {
803 .procname = "watchdog", 809 .procname = "watchdog",
804 .data = &watchdog_enabled, 810 .data = &watchdog_user_enabled,
805 .maxlen = sizeof (int), 811 .maxlen = sizeof (int),
806 .mode = 0644, 812 .mode = 0644,
807 .proc_handler = proc_dowatchdog, 813 .proc_handler = proc_dowatchdog,
@@ -814,7 +820,7 @@ static struct ctl_table kern_table[] = {
814 .maxlen = sizeof(int), 820 .maxlen = sizeof(int),
815 .mode = 0644, 821 .mode = 0644,
816 .proc_handler = proc_dowatchdog, 822 .proc_handler = proc_dowatchdog,
817 .extra1 = &neg_one, 823 .extra1 = &zero,
818 .extra2 = &sixty, 824 .extra2 = &sixty,
819 }, 825 },
820 { 826 {
@@ -828,7 +834,7 @@ static struct ctl_table kern_table[] = {
828 }, 834 },
829 { 835 {
830 .procname = "nmi_watchdog", 836 .procname = "nmi_watchdog",
831 .data = &watchdog_enabled, 837 .data = &watchdog_user_enabled,
832 .maxlen = sizeof (int), 838 .maxlen = sizeof (int),
833 .mode = 0644, 839 .mode = 0644,
834 .proc_handler = proc_dowatchdog, 840 .proc_handler = proc_dowatchdog,
@@ -1044,6 +1050,15 @@ static struct ctl_table kern_table[] = {
1044 .mode = 0644, 1050 .mode = 0644,
1045 .proc_handler = perf_proc_update_handler, 1051 .proc_handler = perf_proc_update_handler,
1046 }, 1052 },
1053 {
1054 .procname = "perf_cpu_time_max_percent",
1055 .data = &sysctl_perf_cpu_time_max_percent,
1056 .maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
1057 .mode = 0644,
1058 .proc_handler = perf_cpu_time_max_percent_handler,
1059 .extra1 = &zero,
1060 .extra2 = &one_hundred,
1061 },
1047#endif 1062#endif
1048#ifdef CONFIG_KMEMCHECK 1063#ifdef CONFIG_KMEMCHECK
1049 { 1064 {
@@ -2331,7 +2346,11 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
2331 int write, void *data) 2346 int write, void *data)
2332{ 2347{
2333 if (write) { 2348 if (write) {
2334 *valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); 2349 unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
2350
2351 if (jif > INT_MAX)
2352 return 1;
2353 *valp = (int)jif;
2335 } else { 2354 } else {
2336 int val = *valp; 2355 int val = *valp;
2337 unsigned long lval; 2356 unsigned long lval;
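The sysctl table gains traceoff_on_warning and perf_cpu_time_max_percent (range-clamped to 0..100 via extra1/extra2), the watchdog entries now point at watchdog_user_enabled with a 0..60 threshold range, and the millisecond-to-jiffies write path rejects values that would overflow the int-sized slot instead of silently truncating them. A sketch of that overflow guard; HZ and the simplified conversion are assumptions for the example, not the kernel's msecs_to_jiffies():

    #include <stdio.h>
    #include <limits.h>

    /* Hypothetical tick rate for the sketch; the kernel uses its CONFIG_HZ. */
    #define HZ 1000

    /* Mirror of the do_proc_dointvec_ms_jiffies_conv() write path: convert
     * milliseconds to jiffies, but refuse values that no longer fit in the
     * int-sized sysctl slot. Returns 0 on success, -1 on overflow. */
    static int ms_to_jiffies_checked(unsigned long msecs, int *out)
    {
        unsigned long jif = msecs / (1000UL / HZ);   /* simplified conversion */

        if (jif > INT_MAX)
            return -1;
        *out = (int)jif;
        return 0;
    }

    int main(void)
    {
        int val;

        if (ms_to_jiffies_checked(4000, &val) == 0)
            printf("4000 ms -> %d jiffies\n", val);

        if (ms_to_jiffies_checked(ULONG_MAX, &val) != 0)
            printf("huge value rejected instead of being truncated\n");
        return 0;
    }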
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index aea4a9ea6fc8..b609213ca9a2 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -3,7 +3,6 @@
3#include "../fs/xfs/xfs_sysctl.h" 3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h> 4#include <linux/sunrpc/debug.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h>
7#include <linux/syscalls.h> 6#include <linux/syscalls.h>
8#include <linux/namei.h> 7#include <linux/namei.h>
9#include <linux/mount.h> 8#include <linux/mount.h>
diff --git a/kernel/time.c b/kernel/time.c
index d3617dbd3dca..7c7964c33ae7 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -11,7 +11,7 @@
11 * Modification history kernel/time.c 11 * Modification history kernel/time.c
12 * 12 *
13 * 1993-09-02 Philip Gladstone 13 * 1993-09-02 Philip Gladstone
14 * Created file with time related functions from sched.c and adjtimex() 14 * Created file with time related functions from sched/core.c and adjtimex()
15 * 1993-10-08 Torsten Duwe 15 * 1993-10-08 Torsten Duwe
16 * adjtime interface update and CMOS clock write code 16 * adjtime interface update and CMOS clock write code
17 * 1995-08-13 Torsten Duwe 17 * 1995-08-13 Torsten Duwe
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ff7d9d2ab504..9250130646f5 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -4,6 +4,8 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
6obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o 6obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
7obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
7obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o 8obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
8obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o 9obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
9obj-$(CONFIG_TIMER_STATS) += timer_stats.o 10obj-$(CONFIG_TIMER_STATS) += timer_stats.o
11obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index f11d83b12949..eec50fcef9e4 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -199,6 +199,13 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
199 199
200} 200}
201 201
202ktime_t alarm_expires_remaining(const struct alarm *alarm)
203{
204 struct alarm_base *base = &alarm_bases[alarm->type];
205 return ktime_sub(alarm->node.expires, base->gettime());
206}
207EXPORT_SYMBOL_GPL(alarm_expires_remaining);
208
202#ifdef CONFIG_RTC_CLASS 209#ifdef CONFIG_RTC_CLASS
203/** 210/**
204 * alarmtimer_suspend - Suspend time callback 211 * alarmtimer_suspend - Suspend time callback
@@ -303,9 +310,10 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
303 alarm->type = type; 310 alarm->type = type;
304 alarm->state = ALARMTIMER_STATE_INACTIVE; 311 alarm->state = ALARMTIMER_STATE_INACTIVE;
305} 312}
313EXPORT_SYMBOL_GPL(alarm_init);
306 314
307/** 315/**
308 * alarm_start - Sets an alarm to fire 316 * alarm_start - Sets an absolute alarm to fire
309 * @alarm: ptr to alarm to set 317 * @alarm: ptr to alarm to set
310 * @start: time to run the alarm 318 * @start: time to run the alarm
311 */ 319 */
@@ -323,6 +331,34 @@ int alarm_start(struct alarm *alarm, ktime_t start)
323 spin_unlock_irqrestore(&base->lock, flags); 331 spin_unlock_irqrestore(&base->lock, flags);
324 return ret; 332 return ret;
325} 333}
334EXPORT_SYMBOL_GPL(alarm_start);
335
336/**
337 * alarm_start_relative - Sets a relative alarm to fire
338 * @alarm: ptr to alarm to set
339 * @start: time relative to now to run the alarm
340 */
341int alarm_start_relative(struct alarm *alarm, ktime_t start)
342{
343 struct alarm_base *base = &alarm_bases[alarm->type];
344
345 start = ktime_add(start, base->gettime());
346 return alarm_start(alarm, start);
347}
348EXPORT_SYMBOL_GPL(alarm_start_relative);
349
350void alarm_restart(struct alarm *alarm)
351{
352 struct alarm_base *base = &alarm_bases[alarm->type];
353 unsigned long flags;
354
355 spin_lock_irqsave(&base->lock, flags);
356 hrtimer_set_expires(&alarm->timer, alarm->node.expires);
357 hrtimer_restart(&alarm->timer);
358 alarmtimer_enqueue(base, alarm);
359 spin_unlock_irqrestore(&base->lock, flags);
360}
361EXPORT_SYMBOL_GPL(alarm_restart);
326 362
327/** 363/**
328 * alarm_try_to_cancel - Tries to cancel an alarm timer 364 * alarm_try_to_cancel - Tries to cancel an alarm timer
@@ -344,6 +380,7 @@ int alarm_try_to_cancel(struct alarm *alarm)
344 spin_unlock_irqrestore(&base->lock, flags); 380 spin_unlock_irqrestore(&base->lock, flags);
345 return ret; 381 return ret;
346} 382}
383EXPORT_SYMBOL_GPL(alarm_try_to_cancel);
347 384
348 385
349/** 386/**
@@ -361,6 +398,7 @@ int alarm_cancel(struct alarm *alarm)
361 cpu_relax(); 398 cpu_relax();
362 } 399 }
363} 400}
401EXPORT_SYMBOL_GPL(alarm_cancel);
364 402
365 403
366u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) 404u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
@@ -393,8 +431,15 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
393 alarm->node.expires = ktime_add(alarm->node.expires, interval); 431 alarm->node.expires = ktime_add(alarm->node.expires, interval);
394 return overrun; 432 return overrun;
395} 433}
434EXPORT_SYMBOL_GPL(alarm_forward);
396 435
436u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
437{
438 struct alarm_base *base = &alarm_bases[alarm->type];
397 439
440 return alarm_forward(alarm, base->gettime(), interval);
441}
442EXPORT_SYMBOL_GPL(alarm_forward_now);
398 443
399 444
400/** 445/**
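The alarmtimer changes export the existing alarm_* API for modular users and add relative-time conveniences: alarm_start_relative() turns a delta into an absolute expiry by adding base->gettime(), alarm_forward_now() forwards from the current base time, and alarm_expires_remaining() reports the time left. A user-space sketch of that relative/absolute bookkeeping; the ktime_ns type and the *_sketch helpers are illustrative only:

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>

    typedef int64_t ktime_ns;              /* ktime-style nanosecond timestamps */

    /* Plays the role of base->gettime() for an ALARM_REALTIME-like base. */
    static ktime_ns base_gettime(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_REALTIME, &ts);
        return (ktime_ns)ts.tv_sec * 1000000000ll + ts.tv_nsec;
    }

    struct alarm_sketch {
        ktime_ns expires;                   /* absolute expiry, like alarm->node.expires */
    };

    /* alarm_start_relative() pattern: relative delta -> absolute expiry. */
    static void alarm_start_relative_sketch(struct alarm_sketch *a, ktime_ns delta)
    {
        a->expires = base_gettime() + delta;
    }

    /* alarm_expires_remaining() pattern: absolute expiry -> time left. */
    static ktime_ns alarm_expires_remaining_sketch(const struct alarm_sketch *a)
    {
        return a->expires - base_gettime();
    }

    int main(void)
    {
        struct alarm_sketch a;

        alarm_start_relative_sketch(&a, 2 * 1000000000ll);   /* 2 s from now */
        printf("remaining: about %lld ns\n",
               (long long)alarm_expires_remaining_sketch(&a));
        return 0;
    }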
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index c6d6400ee137..38959c866789 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -15,20 +15,23 @@
15#include <linux/hrtimer.h> 15#include <linux/hrtimer.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/notifier.h>
19#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/device.h>
20 20
21#include "tick-internal.h" 21#include "tick-internal.h"
22 22
23/* The registered clock event devices */ 23/* The registered clock event devices */
24static LIST_HEAD(clockevent_devices); 24static LIST_HEAD(clockevent_devices);
25static LIST_HEAD(clockevents_released); 25static LIST_HEAD(clockevents_released);
26
27/* Notification for clock events */
28static RAW_NOTIFIER_HEAD(clockevents_chain);
29
30/* Protection for the above */ 26/* Protection for the above */
31static DEFINE_RAW_SPINLOCK(clockevents_lock); 27static DEFINE_RAW_SPINLOCK(clockevents_lock);
28/* Protection for unbind operations */
29static DEFINE_MUTEX(clockevents_mutex);
30
31struct ce_unbind {
32 struct clock_event_device *ce;
33 int res;
34};
32 35
33/** 36/**
34 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 37 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
@@ -232,47 +235,107 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
232 return (rc && force) ? clockevents_program_min_delta(dev) : rc; 235 return (rc && force) ? clockevents_program_min_delta(dev) : rc;
233} 236}
234 237
235/** 238/*
236 * clockevents_register_notifier - register a clock events change listener 239 * Called after a notify add to make devices available which were
240 * released from the notifier call.
237 */ 241 */
238int clockevents_register_notifier(struct notifier_block *nb) 242static void clockevents_notify_released(void)
239{ 243{
240 unsigned long flags; 244 struct clock_event_device *dev;
241 int ret;
242 245
243 raw_spin_lock_irqsave(&clockevents_lock, flags); 246 while (!list_empty(&clockevents_released)) {
244 ret = raw_notifier_chain_register(&clockevents_chain, nb); 247 dev = list_entry(clockevents_released.next,
245 raw_spin_unlock_irqrestore(&clockevents_lock, flags); 248 struct clock_event_device, list);
249 list_del(&dev->list);
250 list_add(&dev->list, &clockevent_devices);
251 tick_check_new_device(dev);
252 }
253}
246 254
247 return ret; 255/*
256 * Try to install a replacement clock event device
257 */
258static int clockevents_replace(struct clock_event_device *ced)
259{
260 struct clock_event_device *dev, *newdev = NULL;
261
262 list_for_each_entry(dev, &clockevent_devices, list) {
263 if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED)
264 continue;
265
266 if (!tick_check_replacement(newdev, dev))
267 continue;
268
269 if (!try_module_get(dev->owner))
270 continue;
271
272 if (newdev)
273 module_put(newdev->owner);
274 newdev = dev;
275 }
276 if (newdev) {
277 tick_install_replacement(newdev);
278 list_del_init(&ced->list);
279 }
280 return newdev ? 0 : -EBUSY;
248} 281}
249 282
250/* 283/*
251 * Notify about a clock event change. Called with clockevents_lock 284 * Called with clockevents_mutex and clockevents_lock held
252 * held.
253 */ 285 */
254static void clockevents_do_notify(unsigned long reason, void *dev) 286static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
255{ 287{
256 raw_notifier_call_chain(&clockevents_chain, reason, dev); 288 /* Fast track. Device is unused */
289 if (ced->mode == CLOCK_EVT_MODE_UNUSED) {
290 list_del_init(&ced->list);
291 return 0;
292 }
293
294 return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY;
257} 295}
258 296
259/* 297/*
260 * Called after a notify add to make devices available which were 298 * SMP function call to unbind a device
261 * released from the notifier call.
262 */ 299 */
263static void clockevents_notify_released(void) 300static void __clockevents_unbind(void *arg)
264{ 301{
265 struct clock_event_device *dev; 302 struct ce_unbind *cu = arg;
303 int res;
304
305 raw_spin_lock(&clockevents_lock);
306 res = __clockevents_try_unbind(cu->ce, smp_processor_id());
307 if (res == -EAGAIN)
308 res = clockevents_replace(cu->ce);
309 cu->res = res;
310 raw_spin_unlock(&clockevents_lock);
311}
266 312
267 while (!list_empty(&clockevents_released)) { 313/*
268 dev = list_entry(clockevents_released.next, 314 * Issues smp function call to unbind a per cpu device. Called with
269 struct clock_event_device, list); 315 * clockevents_mutex held.
270 list_del(&dev->list); 316 */
271 list_add(&dev->list, &clockevent_devices); 317static int clockevents_unbind(struct clock_event_device *ced, int cpu)
272 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 318{
273 } 319 struct ce_unbind cu = { .ce = ced, .res = -ENODEV };
320
321 smp_call_function_single(cpu, __clockevents_unbind, &cu, 1);
322 return cu.res;
274} 323}
275 324
325/*
326 * Unbind a clockevents device.
327 */
328int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
329{
330 int ret;
331
332 mutex_lock(&clockevents_mutex);
333 ret = clockevents_unbind(ced, cpu);
334 mutex_unlock(&clockevents_mutex);
335 return ret;
336}
 337EXPORT_SYMBOL_GPL(clockevents_unbind_device);
338
276/** 339/**
277 * clockevents_register_device - register a clock event device 340 * clockevents_register_device - register a clock event device
278 * @dev: device to register 341 * @dev: device to register
@@ -290,7 +353,7 @@ void clockevents_register_device(struct clock_event_device *dev)
290 raw_spin_lock_irqsave(&clockevents_lock, flags); 353 raw_spin_lock_irqsave(&clockevents_lock, flags);
291 354
292 list_add(&dev->list, &clockevent_devices); 355 list_add(&dev->list, &clockevent_devices);
293 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 356 tick_check_new_device(dev);
294 clockevents_notify_released(); 357 clockevents_notify_released();
295 358
296 raw_spin_unlock_irqrestore(&clockevents_lock, flags); 359 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
@@ -386,6 +449,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
386 * released list and do a notify add later. 449 * released list and do a notify add later.
387 */ 450 */
388 if (old) { 451 if (old) {
452 module_put(old->owner);
389 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); 453 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
390 list_del(&old->list); 454 list_del(&old->list);
391 list_add(&old->list, &clockevents_released); 455 list_add(&old->list, &clockevents_released);
@@ -433,10 +497,36 @@ void clockevents_notify(unsigned long reason, void *arg)
433 int cpu; 497 int cpu;
434 498
435 raw_spin_lock_irqsave(&clockevents_lock, flags); 499 raw_spin_lock_irqsave(&clockevents_lock, flags);
436 clockevents_do_notify(reason, arg);
437 500
438 switch (reason) { 501 switch (reason) {
502 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
503 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
504 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
505 tick_broadcast_on_off(reason, arg);
506 break;
507
508 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
509 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
510 tick_broadcast_oneshot_control(reason);
511 break;
512
513 case CLOCK_EVT_NOTIFY_CPU_DYING:
514 tick_handover_do_timer(arg);
515 break;
516
517 case CLOCK_EVT_NOTIFY_SUSPEND:
518 tick_suspend();
519 tick_suspend_broadcast();
520 break;
521
522 case CLOCK_EVT_NOTIFY_RESUME:
523 tick_resume();
524 break;
525
439 case CLOCK_EVT_NOTIFY_CPU_DEAD: 526 case CLOCK_EVT_NOTIFY_CPU_DEAD:
527 tick_shutdown_broadcast_oneshot(arg);
528 tick_shutdown_broadcast(arg);
529 tick_shutdown(arg);
440 /* 530 /*
441 * Unregister the clock event devices which were 531 * Unregister the clock event devices which were
442 * released from the users in the notify chain. 532 * released from the users in the notify chain.
@@ -462,4 +552,123 @@ void clockevents_notify(unsigned long reason, void *arg)
462 raw_spin_unlock_irqrestore(&clockevents_lock, flags); 552 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
463} 553}
464EXPORT_SYMBOL_GPL(clockevents_notify); 554EXPORT_SYMBOL_GPL(clockevents_notify);
555
556#ifdef CONFIG_SYSFS
557struct bus_type clockevents_subsys = {
558 .name = "clockevents",
559 .dev_name = "clockevent",
560};
561
562static DEFINE_PER_CPU(struct device, tick_percpu_dev);
563static struct tick_device *tick_get_tick_dev(struct device *dev);
564
565static ssize_t sysfs_show_current_tick_dev(struct device *dev,
566 struct device_attribute *attr,
567 char *buf)
568{
569 struct tick_device *td;
570 ssize_t count = 0;
571
572 raw_spin_lock_irq(&clockevents_lock);
573 td = tick_get_tick_dev(dev);
574 if (td && td->evtdev)
575 count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name);
576 raw_spin_unlock_irq(&clockevents_lock);
577 return count;
578}
579static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL);
580
581/* We don't support the abomination of removable broadcast devices */
582static ssize_t sysfs_unbind_tick_dev(struct device *dev,
583 struct device_attribute *attr,
584 const char *buf, size_t count)
585{
586 char name[CS_NAME_LEN];
587 size_t ret = sysfs_get_uname(buf, name, count);
588 struct clock_event_device *ce;
589
590 if (ret < 0)
591 return ret;
592
593 ret = -ENODEV;
594 mutex_lock(&clockevents_mutex);
595 raw_spin_lock_irq(&clockevents_lock);
596 list_for_each_entry(ce, &clockevent_devices, list) {
597 if (!strcmp(ce->name, name)) {
598 ret = __clockevents_try_unbind(ce, dev->id);
599 break;
600 }
601 }
602 raw_spin_unlock_irq(&clockevents_lock);
603 /*
604 * We hold clockevents_mutex, so ce can't go away
605 */
606 if (ret == -EAGAIN)
607 ret = clockevents_unbind(ce, dev->id);
608 mutex_unlock(&clockevents_mutex);
609 return ret ? ret : count;
610}
611static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev);
612
613#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
614static struct device tick_bc_dev = {
615 .init_name = "broadcast",
616 .id = 0,
617 .bus = &clockevents_subsys,
618};
619
620static struct tick_device *tick_get_tick_dev(struct device *dev)
621{
622 return dev == &tick_bc_dev ? tick_get_broadcast_device() :
623 &per_cpu(tick_cpu_device, dev->id);
624}
625
626static __init int tick_broadcast_init_sysfs(void)
627{
628 int err = device_register(&tick_bc_dev);
629
630 if (!err)
631 err = device_create_file(&tick_bc_dev, &dev_attr_current_device);
632 return err;
633}
634#else
635static struct tick_device *tick_get_tick_dev(struct device *dev)
636{
637 return &per_cpu(tick_cpu_device, dev->id);
638}
639static inline int tick_broadcast_init_sysfs(void) { return 0; }
465#endif 640#endif
641
642static int __init tick_init_sysfs(void)
643{
644 int cpu;
645
646 for_each_possible_cpu(cpu) {
647 struct device *dev = &per_cpu(tick_percpu_dev, cpu);
648 int err;
649
650 dev->id = cpu;
651 dev->bus = &clockevents_subsys;
652 err = device_register(dev);
653 if (!err)
654 err = device_create_file(dev, &dev_attr_current_device);
655 if (!err)
656 err = device_create_file(dev, &dev_attr_unbind_device);
657 if (err)
658 return err;
659 }
660 return tick_broadcast_init_sysfs();
661}
662
663static int __init clockevents_init_sysfs(void)
664{
665 int err = subsys_system_register(&clockevents_subsys, NULL);
666
667 if (!err)
668 err = tick_init_sysfs();
669 return err;
670}
671device_initcall(clockevents_init_sysfs);
672#endif /* SYSFS */
673
674#endif /* GENERIC_CLOCK_EVENTS */
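This rewrite drops the clockevents notifier chain: clockevents_notify() now calls the tick_* handlers directly, and a new unbind path lets a clock event device be detached at runtime, either via clockevents_unbind_device() or the per-CPU unbind_device sysfs attribute. Unbinding succeeds immediately if the device is unused; if it is the active per-CPU device, clockevents_replace() first tries to install a spare one. The sketch below models that decision with a plain array and picks the replacement purely by rating; in the kernel the choice additionally goes through tick_check_replacement() and module refcounting, and it runs on the target CPU via smp_call_function_single().

    #include <stdio.h>

    /* Hypothetical miniature of the replacement logic: an "unbind" succeeds
     * only if the device is unused or another device can take over. */
    struct ce_dev {
        const char *name;
        int rating;
        int in_use;          /* stands in for mode != CLOCK_EVT_MODE_UNUSED */
    };

    static struct ce_dev devices[] = {
        { "lapic-deadline", 400, 1 },   /* currently active */
        { "hpet",           250, 0 },
        { "pit",            110, 0 },
    };
    static struct ce_dev *active = &devices[0];

    static int unbind(struct ce_dev *ced)
    {
        struct ce_dev *best = NULL;
        size_t i;

        if (!ced->in_use)                    /* fast path: device is unused */
            return 0;
        if (ced != active)                   /* busy for some other reason */
            return -1;

        /* clockevents_replace() pattern: pick the best-rated spare device. */
        for (i = 0; i < sizeof(devices) / sizeof(devices[0]); i++) {
            if (&devices[i] == ced || devices[i].in_use)
                continue;
            if (!best || devices[i].rating > best->rating)
                best = &devices[i];
        }
        if (!best)
            return -1;                       /* -EBUSY in the kernel */

        ced->in_use = 0;
        best->in_use = 1;
        active = best;
        return 0;
    }

    int main(void)
    {
        if (unbind(&devices[0]) == 0)
            printf("unbound lapic-deadline, now using %s\n", active->name);
        return 0;
    }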
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c9583382141a..50a8736757f3 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -31,6 +31,8 @@
31#include <linux/tick.h> 31#include <linux/tick.h>
32#include <linux/kthread.h> 32#include <linux/kthread.h>
33 33
34#include "tick-internal.h"
35
34void timecounter_init(struct timecounter *tc, 36void timecounter_init(struct timecounter *tc,
35 const struct cyclecounter *cc, 37 const struct cyclecounter *cc,
36 u64 start_tstamp) 38 u64 start_tstamp)
@@ -174,11 +176,12 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
174static struct clocksource *curr_clocksource; 176static struct clocksource *curr_clocksource;
175static LIST_HEAD(clocksource_list); 177static LIST_HEAD(clocksource_list);
176static DEFINE_MUTEX(clocksource_mutex); 178static DEFINE_MUTEX(clocksource_mutex);
177static char override_name[32]; 179static char override_name[CS_NAME_LEN];
178static int finished_booting; 180static int finished_booting;
179 181
180#ifdef CONFIG_CLOCKSOURCE_WATCHDOG 182#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
181static void clocksource_watchdog_work(struct work_struct *work); 183static void clocksource_watchdog_work(struct work_struct *work);
184static void clocksource_select(void);
182 185
183static LIST_HEAD(watchdog_list); 186static LIST_HEAD(watchdog_list);
184static struct clocksource *watchdog; 187static struct clocksource *watchdog;
@@ -299,13 +302,30 @@ static void clocksource_watchdog(unsigned long data)
299 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && 302 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
300 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && 303 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
301 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { 304 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
305 /* Mark it valid for high-res. */
302 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 306 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
307
308 /*
309 * clocksource_done_booting() will sort it if
310 * finished_booting is not set yet.
311 */
312 if (!finished_booting)
313 continue;
314
303 /* 315 /*
304 * We just marked the clocksource as highres-capable, 316 * If this is not the current clocksource let
305 * notify the rest of the system as well so that we 317 * the watchdog thread reselect it. Due to the
306 * transition into high-res mode: 318 * change to high res this clocksource might
319 * be preferred now. If it is the current
320 * clocksource let the tick code know about
321 * that change.
307 */ 322 */
308 tick_clock_notify(); 323 if (cs != curr_clocksource) {
324 cs->flags |= CLOCK_SOURCE_RESELECT;
325 schedule_work(&watchdog_work);
326 } else {
327 tick_clock_notify();
328 }
309 } 329 }
310 } 330 }
311 331
@@ -388,44 +408,39 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
388 408
389static void clocksource_dequeue_watchdog(struct clocksource *cs) 409static void clocksource_dequeue_watchdog(struct clocksource *cs)
390{ 410{
391 struct clocksource *tmp;
392 unsigned long flags; 411 unsigned long flags;
393 412
394 spin_lock_irqsave(&watchdog_lock, flags); 413 spin_lock_irqsave(&watchdog_lock, flags);
395 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { 414 if (cs != watchdog) {
396 /* cs is a watched clocksource. */ 415 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
397 list_del_init(&cs->wd_list); 416 /* cs is a watched clocksource. */
398 } else if (cs == watchdog) { 417 list_del_init(&cs->wd_list);
399 /* Reset watchdog cycles */ 418 /* Check if the watchdog timer needs to be stopped. */
400 clocksource_reset_watchdog(); 419 clocksource_stop_watchdog();
401 /* Current watchdog is removed. Find an alternative. */
402 watchdog = NULL;
403 list_for_each_entry(tmp, &clocksource_list, list) {
404 if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
405 continue;
406 if (!watchdog || tmp->rating > watchdog->rating)
407 watchdog = tmp;
408 } 420 }
409 } 421 }
410 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
411 /* Check if the watchdog timer needs to be stopped. */
412 clocksource_stop_watchdog();
413 spin_unlock_irqrestore(&watchdog_lock, flags); 422 spin_unlock_irqrestore(&watchdog_lock, flags);
414} 423}
415 424
416static int clocksource_watchdog_kthread(void *data) 425static int __clocksource_watchdog_kthread(void)
417{ 426{
418 struct clocksource *cs, *tmp; 427 struct clocksource *cs, *tmp;
419 unsigned long flags; 428 unsigned long flags;
420 LIST_HEAD(unstable); 429 LIST_HEAD(unstable);
430 int select = 0;
421 431
422 mutex_lock(&clocksource_mutex);
423 spin_lock_irqsave(&watchdog_lock, flags); 432 spin_lock_irqsave(&watchdog_lock, flags);
424 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) 433 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
425 if (cs->flags & CLOCK_SOURCE_UNSTABLE) { 434 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
426 list_del_init(&cs->wd_list); 435 list_del_init(&cs->wd_list);
427 list_add(&cs->wd_list, &unstable); 436 list_add(&cs->wd_list, &unstable);
437 select = 1;
428 } 438 }
439 if (cs->flags & CLOCK_SOURCE_RESELECT) {
440 cs->flags &= ~CLOCK_SOURCE_RESELECT;
441 select = 1;
442 }
443 }
429 /* Check if the watchdog timer needs to be stopped. */ 444 /* Check if the watchdog timer needs to be stopped. */
430 clocksource_stop_watchdog(); 445 clocksource_stop_watchdog();
431 spin_unlock_irqrestore(&watchdog_lock, flags); 446 spin_unlock_irqrestore(&watchdog_lock, flags);
@@ -435,10 +450,23 @@ static int clocksource_watchdog_kthread(void *data)
435 list_del_init(&cs->wd_list); 450 list_del_init(&cs->wd_list);
436 __clocksource_change_rating(cs, 0); 451 __clocksource_change_rating(cs, 0);
437 } 452 }
453 return select;
454}
455
456static int clocksource_watchdog_kthread(void *data)
457{
458 mutex_lock(&clocksource_mutex);
459 if (__clocksource_watchdog_kthread())
460 clocksource_select();
438 mutex_unlock(&clocksource_mutex); 461 mutex_unlock(&clocksource_mutex);
439 return 0; 462 return 0;
440} 463}
441 464
465static bool clocksource_is_watchdog(struct clocksource *cs)
466{
467 return cs == watchdog;
468}
469
442#else /* CONFIG_CLOCKSOURCE_WATCHDOG */ 470#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
443 471
444static void clocksource_enqueue_watchdog(struct clocksource *cs) 472static void clocksource_enqueue_watchdog(struct clocksource *cs)
@@ -449,7 +477,8 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
449 477
450static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } 478static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
451static inline void clocksource_resume_watchdog(void) { } 479static inline void clocksource_resume_watchdog(void) { }
452static inline int clocksource_watchdog_kthread(void *data) { return 0; } 480static inline int __clocksource_watchdog_kthread(void) { return 0; }
481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
453 482
454#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 483#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
455 484
@@ -553,24 +582,42 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
553 582
554#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET 583#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
555 584
556/** 585static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
557 * clocksource_select - Select the best clocksource available
558 *
559 * Private function. Must hold clocksource_mutex when called.
560 *
561 * Select the clocksource with the best rating, or the clocksource,
562 * which is selected by userspace override.
563 */
564static void clocksource_select(void)
565{ 586{
566 struct clocksource *best, *cs; 587 struct clocksource *cs;
567 588
568 if (!finished_booting || list_empty(&clocksource_list)) 589 if (!finished_booting || list_empty(&clocksource_list))
590 return NULL;
591
592 /*
593 * We pick the clocksource with the highest rating. If oneshot
594 * mode is active, we pick the highres valid clocksource with
595 * the best rating.
596 */
597 list_for_each_entry(cs, &clocksource_list, list) {
598 if (skipcur && cs == curr_clocksource)
599 continue;
600 if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
601 continue;
602 return cs;
603 }
604 return NULL;
605}
606
607static void __clocksource_select(bool skipcur)
608{
609 bool oneshot = tick_oneshot_mode_active();
610 struct clocksource *best, *cs;
611
612 /* Find the best suitable clocksource */
613 best = clocksource_find_best(oneshot, skipcur);
614 if (!best)
569 return; 615 return;
570 /* First clocksource on the list has the best rating. */ 616
571 best = list_first_entry(&clocksource_list, struct clocksource, list);
572 /* Check for the override clocksource. */ 617 /* Check for the override clocksource. */
573 list_for_each_entry(cs, &clocksource_list, list) { 618 list_for_each_entry(cs, &clocksource_list, list) {
619 if (skipcur && cs == curr_clocksource)
620 continue;
574 if (strcmp(cs->name, override_name) != 0) 621 if (strcmp(cs->name, override_name) != 0)
575 continue; 622 continue;
576 /* 623 /*
@@ -578,8 +625,7 @@ static void clocksource_select(void)
578 * capable clocksource if the tick code is in oneshot 625 * capable clocksource if the tick code is in oneshot
579 * mode (highres or nohz) 626 * mode (highres or nohz)
580 */ 627 */
581 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && 628 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
582 tick_oneshot_mode_active()) {
583 /* Override clocksource cannot be used. */ 629 /* Override clocksource cannot be used. */
584 printk(KERN_WARNING "Override clocksource %s is not " 630 printk(KERN_WARNING "Override clocksource %s is not "
585 "HRT compatible. Cannot switch while in " 631 "HRT compatible. Cannot switch while in "
@@ -590,16 +636,35 @@ static void clocksource_select(void)
590 best = cs; 636 best = cs;
591 break; 637 break;
592 } 638 }
593 if (curr_clocksource != best) { 639
594 printk(KERN_INFO "Switching to clocksource %s\n", best->name); 640 if (curr_clocksource != best && !timekeeping_notify(best)) {
641 pr_info("Switched to clocksource %s\n", best->name);
595 curr_clocksource = best; 642 curr_clocksource = best;
596 timekeeping_notify(curr_clocksource);
597 } 643 }
598} 644}
599 645
646/**
647 * clocksource_select - Select the best clocksource available
648 *
649 * Private function. Must hold clocksource_mutex when called.
650 *
651 * Select the clocksource with the best rating, or the clocksource,
652 * which is selected by userspace override.
653 */
654static void clocksource_select(void)
655{
656 return __clocksource_select(false);
657}
658
659static void clocksource_select_fallback(void)
660{
661 return __clocksource_select(true);
662}
663
600#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ 664#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
601 665
602static inline void clocksource_select(void) { } 666static inline void clocksource_select(void) { }
667static inline void clocksource_select_fallback(void) { }
603 668
604#endif 669#endif
605 670
@@ -614,16 +679,11 @@ static int __init clocksource_done_booting(void)
614{ 679{
615 mutex_lock(&clocksource_mutex); 680 mutex_lock(&clocksource_mutex);
616 curr_clocksource = clocksource_default_clock(); 681 curr_clocksource = clocksource_default_clock();
617 mutex_unlock(&clocksource_mutex);
618
619 finished_booting = 1; 682 finished_booting = 1;
620
621 /* 683 /*
622 * Run the watchdog first to eliminate unstable clock sources 684 * Run the watchdog first to eliminate unstable clock sources
623 */ 685 */
624 clocksource_watchdog_kthread(NULL); 686 __clocksource_watchdog_kthread();
625
626 mutex_lock(&clocksource_mutex);
627 clocksource_select(); 687 clocksource_select();
628 mutex_unlock(&clocksource_mutex); 688 mutex_unlock(&clocksource_mutex);
629 return 0; 689 return 0;
@@ -756,7 +816,6 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
756 list_del(&cs->list); 816 list_del(&cs->list);
757 cs->rating = rating; 817 cs->rating = rating;
758 clocksource_enqueue(cs); 818 clocksource_enqueue(cs);
759 clocksource_select();
760} 819}
761 820
762/** 821/**
@@ -768,21 +827,47 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
768{ 827{
769 mutex_lock(&clocksource_mutex); 828 mutex_lock(&clocksource_mutex);
770 __clocksource_change_rating(cs, rating); 829 __clocksource_change_rating(cs, rating);
830 clocksource_select();
771 mutex_unlock(&clocksource_mutex); 831 mutex_unlock(&clocksource_mutex);
772} 832}
773EXPORT_SYMBOL(clocksource_change_rating); 833EXPORT_SYMBOL(clocksource_change_rating);
774 834
835/*
836 * Unbind clocksource @cs. Called with clocksource_mutex held
837 */
838static int clocksource_unbind(struct clocksource *cs)
839{
840 /*
841 * I really can't convince myself to support this on hardware
842 * designed by lobotomized monkeys.
843 */
844 if (clocksource_is_watchdog(cs))
845 return -EBUSY;
846
847 if (cs == curr_clocksource) {
848 /* Select and try to install a replacement clock source */
849 clocksource_select_fallback();
850 if (curr_clocksource == cs)
851 return -EBUSY;
852 }
853 clocksource_dequeue_watchdog(cs);
854 list_del_init(&cs->list);
855 return 0;
856}
857
775/** 858/**
776 * clocksource_unregister - remove a registered clocksource 859 * clocksource_unregister - remove a registered clocksource
777 * @cs: clocksource to be unregistered 860 * @cs: clocksource to be unregistered
778 */ 861 */
779void clocksource_unregister(struct clocksource *cs) 862int clocksource_unregister(struct clocksource *cs)
780{ 863{
864 int ret = 0;
865
781 mutex_lock(&clocksource_mutex); 866 mutex_lock(&clocksource_mutex);
782 clocksource_dequeue_watchdog(cs); 867 if (!list_empty(&cs->list))
783 list_del(&cs->list); 868 ret = clocksource_unbind(cs);
784 clocksource_select();
785 mutex_unlock(&clocksource_mutex); 869 mutex_unlock(&clocksource_mutex);
870 return ret;
786} 871}
787EXPORT_SYMBOL(clocksource_unregister); 872EXPORT_SYMBOL(clocksource_unregister);
788 873
@@ -808,6 +893,23 @@ sysfs_show_current_clocksources(struct device *dev,
808 return count; 893 return count;
809} 894}
810 895
896size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
897{
898 size_t ret = cnt;
899
900 /* strings from sysfs write are not 0 terminated! */
901 if (!cnt || cnt >= CS_NAME_LEN)
902 return -EINVAL;
903
 904 /* strip off \n: */
905 if (buf[cnt-1] == '\n')
906 cnt--;
907 if (cnt > 0)
908 memcpy(dst, buf, cnt);
909 dst[cnt] = 0;
910 return ret;
911}
912
811/** 913/**
812 * sysfs_override_clocksource - interface for manually overriding clocksource 914 * sysfs_override_clocksource - interface for manually overriding clocksource
813 * @dev: unused 915 * @dev: unused
@@ -822,22 +924,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
822 struct device_attribute *attr, 924 struct device_attribute *attr,
823 const char *buf, size_t count) 925 const char *buf, size_t count)
824{ 926{
825 size_t ret = count; 927 size_t ret;
826
827 /* strings from sysfs write are not 0 terminated! */
828 if (count >= sizeof(override_name))
829 return -EINVAL;
830
831 /* strip of \n: */
832 if (buf[count-1] == '\n')
833 count--;
834 928
835 mutex_lock(&clocksource_mutex); 929 mutex_lock(&clocksource_mutex);
836 930
837 if (count > 0) 931 ret = sysfs_get_uname(buf, override_name, count);
838 memcpy(override_name, buf, count); 932 if (ret >= 0)
839 override_name[count] = 0; 933 clocksource_select();
840 clocksource_select();
841 934
842 mutex_unlock(&clocksource_mutex); 935 mutex_unlock(&clocksource_mutex);
843 936
@@ -845,6 +938,40 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
845} 938}
846 939
847/** 940/**
941 * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource
942 * @dev: unused
943 * @attr: unused
944 * @buf: unused
945 * @count: length of buffer
946 *
947 * Takes input from sysfs interface for manually unbinding a clocksource.
948 */
949static ssize_t sysfs_unbind_clocksource(struct device *dev,
950 struct device_attribute *attr,
951 const char *buf, size_t count)
952{
953 struct clocksource *cs;
954 char name[CS_NAME_LEN];
955 size_t ret;
956
957 ret = sysfs_get_uname(buf, name, count);
958 if (ret < 0)
959 return ret;
960
961 ret = -ENODEV;
962 mutex_lock(&clocksource_mutex);
963 list_for_each_entry(cs, &clocksource_list, list) {
964 if (strcmp(cs->name, name))
965 continue;
966 ret = clocksource_unbind(cs);
967 break;
968 }
969 mutex_unlock(&clocksource_mutex);
970
971 return ret ? ret : count;
972}
973
974/**
848 * sysfs_show_available_clocksources - sysfs interface for listing clocksource 975 * sysfs_show_available_clocksources - sysfs interface for listing clocksource
849 * @dev: unused 976 * @dev: unused
850 * @attr: unused 977 * @attr: unused
@@ -886,6 +1013,8 @@ sysfs_show_available_clocksources(struct device *dev,
886static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, 1013static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
887 sysfs_override_clocksource); 1014 sysfs_override_clocksource);
888 1015
1016static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource);
1017
889static DEVICE_ATTR(available_clocksource, 0444, 1018static DEVICE_ATTR(available_clocksource, 0444,
890 sysfs_show_available_clocksources, NULL); 1019 sysfs_show_available_clocksources, NULL);
891 1020
@@ -910,6 +1039,9 @@ static int __init init_clocksource_sysfs(void)
910 &device_clocksource, 1039 &device_clocksource,
911 &dev_attr_current_clocksource); 1040 &dev_attr_current_clocksource);
912 if (!error) 1041 if (!error)
1042 error = device_create_file(&device_clocksource,
1043 &dev_attr_unbind_clocksource);
1044 if (!error)
913 error = device_create_file( 1045 error = device_create_file(
914 &device_clocksource, 1046 &device_clocksource,
915 &dev_attr_available_clocksource); 1047 &dev_attr_available_clocksource);
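
A note on the new interface: unbind_clocksource takes a clocksource name, runs it through sysfs_get_uname() (which rejects empty or over-long input and strips a trailing newline) and calls clocksource_unbind() on the matching entry. A minimal userspace sketch of driving it follows; the sysfs path and the clocksource name are illustrative assumptions, not part of the patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* assumed location: the attribute hangs off the clocksource device */
        const char *path =
                "/sys/devices/system/clocksource/clocksource0/unbind_clocksource";
        const char *name = "hpet\n";    /* trailing \n is stripped by sysfs_get_uname() */
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* on failure write() returns -1 with errno set, e.g. ENODEV for an unknown name */
        if (write(fd, name, strlen(name)) < 0)
                perror("write");
        close(fd);
        return 0;
}
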
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
new file mode 100644
index 000000000000..a326f27d7f09
--- /dev/null
+++ b/kernel/time/sched_clock.c
@@ -0,0 +1,212 @@
1/*
2 * sched_clock.c: support for extending counters to full 64-bit ns counter
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8#include <linux/clocksource.h>
9#include <linux/init.h>
10#include <linux/jiffies.h>
11#include <linux/kernel.h>
12#include <linux/moduleparam.h>
13#include <linux/sched.h>
14#include <linux/syscore_ops.h>
15#include <linux/timer.h>
16#include <linux/sched_clock.h>
17
18struct clock_data {
19 u64 epoch_ns;
20 u32 epoch_cyc;
21 u32 epoch_cyc_copy;
22 unsigned long rate;
23 u32 mult;
24 u32 shift;
25 bool suspended;
26};
27
28static void sched_clock_poll(unsigned long wrap_ticks);
29static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0);
30static int irqtime = -1;
31
32core_param(irqtime, irqtime, int, 0400);
33
34static struct clock_data cd = {
35 .mult = NSEC_PER_SEC / HZ,
36};
37
38static u32 __read_mostly sched_clock_mask = 0xffffffff;
39
40static u32 notrace jiffy_sched_clock_read(void)
41{
42 return (u32)(jiffies - INITIAL_JIFFIES);
43}
44
45static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
46
47static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
48{
49 return (cyc * mult) >> shift;
50}
51
52static unsigned long long notrace sched_clock_32(void)
53{
54 u64 epoch_ns;
55 u32 epoch_cyc;
56 u32 cyc;
57
58 if (cd.suspended)
59 return cd.epoch_ns;
60
61 /*
62 * Load the epoch_cyc and epoch_ns atomically. We do this by
63 * ensuring that we always write epoch_cyc, epoch_ns and
64 * epoch_cyc_copy in strict order, and read them in strict order.
65 * If epoch_cyc and epoch_cyc_copy are not equal, then we're in
66 * the middle of an update, and we should repeat the load.
67 */
68 do {
69 epoch_cyc = cd.epoch_cyc;
70 smp_rmb();
71 epoch_ns = cd.epoch_ns;
72 smp_rmb();
73 } while (epoch_cyc != cd.epoch_cyc_copy);
74
75 cyc = read_sched_clock();
76 cyc = (cyc - epoch_cyc) & sched_clock_mask;
77 return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
78}
79
80/*
81 * Atomically update the sched_clock epoch.
82 */
83static void notrace update_sched_clock(void)
84{
85 unsigned long flags;
86 u32 cyc;
87 u64 ns;
88
89 cyc = read_sched_clock();
90 ns = cd.epoch_ns +
91 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
92 cd.mult, cd.shift);
93 /*
94 * Write epoch_cyc and epoch_ns in a way that the update is
95 * detectable in sched_clock_32().
96 */
97 raw_local_irq_save(flags);
98 cd.epoch_cyc_copy = cyc;
99 smp_wmb();
100 cd.epoch_ns = ns;
101 smp_wmb();
102 cd.epoch_cyc = cyc;
103 raw_local_irq_restore(flags);
104}
105
106static void sched_clock_poll(unsigned long wrap_ticks)
107{
108 mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));
109 update_sched_clock();
110}
111
112void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
113{
114 unsigned long r, w;
115 u64 res, wrap;
116 char r_unit;
117
118 if (cd.rate > rate)
119 return;
120
121 BUG_ON(bits > 32);
122 WARN_ON(!irqs_disabled());
123 read_sched_clock = read;
124 sched_clock_mask = (1 << bits) - 1;
125 cd.rate = rate;
126
127 /* calculate the mult/shift to convert counter ticks to ns. */
128 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0);
129
130 r = rate;
131 if (r >= 4000000) {
132 r /= 1000000;
133 r_unit = 'M';
134 } else if (r >= 1000) {
135 r /= 1000;
136 r_unit = 'k';
137 } else
138 r_unit = ' ';
139
140 /* calculate how many ns until we wrap */
141 wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift);
142 do_div(wrap, NSEC_PER_MSEC);
143 w = wrap;
144
145 /* calculate the ns resolution of this counter */
146 res = cyc_to_ns(1ULL, cd.mult, cd.shift);
147 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n",
148 bits, r, r_unit, res, w);
149
150 /*
151 * Start the timer to keep sched_clock() properly updated and
152 * set the initial epoch.
153 */
154 sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));
155 update_sched_clock();
156
157 /*
158 * Ensure that sched_clock() starts off at 0ns
159 */
160 cd.epoch_ns = 0;
161
162 /* Enable IRQ time accounting if we have a fast enough sched_clock */
163 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
164 enable_sched_clock_irqtime();
165
166 pr_debug("Registered %pF as sched_clock source\n", read);
167}
168
169unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32;
170
171unsigned long long notrace sched_clock(void)
172{
173 return sched_clock_func();
174}
175
176void __init sched_clock_postinit(void)
177{
178 /*
179 * If no sched_clock function has been provided at that point,
180 * make it the final one.
181 */
182 if (read_sched_clock == jiffy_sched_clock_read)
183 setup_sched_clock(jiffy_sched_clock_read, 32, HZ);
184
185 sched_clock_poll(sched_clock_timer.data);
186}
187
188static int sched_clock_suspend(void)
189{
190 sched_clock_poll(sched_clock_timer.data);
191 cd.suspended = true;
192 return 0;
193}
194
195static void sched_clock_resume(void)
196{
197 cd.epoch_cyc = read_sched_clock();
198 cd.epoch_cyc_copy = cd.epoch_cyc;
199 cd.suspended = false;
200}
201
202static struct syscore_ops sched_clock_ops = {
203 .suspend = sched_clock_suspend,
204 .resume = sched_clock_resume,
205};
206
207static int __init sched_clock_syscore_init(void)
208{
209 register_syscore_ops(&sched_clock_ops);
210 return 0;
211}
212device_initcall(sched_clock_syscore_init);
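
setup_sched_clock() above boils down to choosing a mult/shift pair so that cyc_to_ns(cyc) = (cyc * mult) >> shift approximates cyc * NSEC_PER_SEC / rate, then reporting the resulting resolution and wrap interval. A standalone worked example for a hypothetical 32-bit counter at 24 MHz; the constants are hand-picked assumptions rather than output of clocks_calc_mult_shift().

#include <stdint.h>
#include <stdio.h>

static inline uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
        return (cyc * mult) >> shift;
}

int main(void)
{
        /* 1e9 / 24e6 = 41.666... ns per cycle, scaled by 2^26 */
        const uint32_t mult = 2796202667u;
        const uint32_t shift = 26;
        uint64_t wrap_ns = cyc_to_ns((1ULL << 32) - 1, mult, shift);

        /* prints: resolution 41 ns, wraps every 178956 ms (~179 s) */
        printf("resolution %llu ns, wraps every %llu ms\n",
               (unsigned long long)cyc_to_ns(1, mult, shift),
               (unsigned long long)(wrap_ns / 1000000));
        return 0;
}
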
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 20d6fba70652..218bcb565fed 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -19,6 +19,7 @@
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/module.h>
22 23
23#include "tick-internal.h" 24#include "tick-internal.h"
24 25
@@ -29,6 +30,7 @@
29 30
30static struct tick_device tick_broadcast_device; 31static struct tick_device tick_broadcast_device;
31static cpumask_var_t tick_broadcast_mask; 32static cpumask_var_t tick_broadcast_mask;
33static cpumask_var_t tick_broadcast_on;
32static cpumask_var_t tmpmask; 34static cpumask_var_t tmpmask;
33static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); 35static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
34static int tick_broadcast_force; 36static int tick_broadcast_force;
@@ -64,17 +66,34 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
64/* 66/*
65 * Check, if the device can be utilized as broadcast device: 67 * Check, if the device can be utilized as broadcast device:
66 */ 68 */
67int tick_check_broadcast_device(struct clock_event_device *dev) 69static bool tick_check_broadcast_device(struct clock_event_device *curdev,
70 struct clock_event_device *newdev)
71{
72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
73 (newdev->features & CLOCK_EVT_FEAT_C3STOP))
74 return false;
75
76 if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT &&
77 !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
78 return false;
79
80 return !curdev || newdev->rating > curdev->rating;
81}
82
83/*
84 * Conditionally install/replace broadcast device
85 */
86void tick_install_broadcast_device(struct clock_event_device *dev)
68{ 87{
69 struct clock_event_device *cur = tick_broadcast_device.evtdev; 88 struct clock_event_device *cur = tick_broadcast_device.evtdev;
70 89
71 if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || 90 if (!tick_check_broadcast_device(cur, dev))
72 (tick_broadcast_device.evtdev && 91 return;
73 tick_broadcast_device.evtdev->rating >= dev->rating) || 92
74 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 93 if (!try_module_get(dev->owner))
75 return 0; 94 return;
76 95
77 clockevents_exchange_device(tick_broadcast_device.evtdev, dev); 96 clockevents_exchange_device(cur, dev);
78 if (cur) 97 if (cur)
79 cur->event_handler = clockevents_handle_noop; 98 cur->event_handler = clockevents_handle_noop;
80 tick_broadcast_device.evtdev = dev; 99 tick_broadcast_device.evtdev = dev;
@@ -90,7 +109,6 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
90 */ 109 */
91 if (dev->features & CLOCK_EVT_FEAT_ONESHOT) 110 if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
92 tick_clock_notify(); 111 tick_clock_notify();
93 return 1;
94} 112}
95 113
96/* 114/*
@@ -123,8 +141,9 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
123 */ 141 */
124int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) 142int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
125{ 143{
144 struct clock_event_device *bc = tick_broadcast_device.evtdev;
126 unsigned long flags; 145 unsigned long flags;
127 int ret = 0; 146 int ret;
128 147
129 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 148 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
130 149
@@ -138,20 +157,62 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
138 dev->event_handler = tick_handle_periodic; 157 dev->event_handler = tick_handle_periodic;
139 tick_device_setup_broadcast_func(dev); 158 tick_device_setup_broadcast_func(dev);
140 cpumask_set_cpu(cpu, tick_broadcast_mask); 159 cpumask_set_cpu(cpu, tick_broadcast_mask);
141 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 160 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
161 tick_broadcast_start_periodic(bc);
162 else
163 tick_broadcast_setup_oneshot(bc);
142 ret = 1; 164 ret = 1;
143 } else { 165 } else {
144 /* 166 /*
145 * When the new device is not affected by the stop 167 * Clear the broadcast bit for this cpu if the
146 * feature and the cpu is marked in the broadcast mask 168 * device is not power state affected.
147 * then clear the broadcast bit.
148 */ 169 */
149 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 170 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
150 int cpu = smp_processor_id();
151 cpumask_clear_cpu(cpu, tick_broadcast_mask); 171 cpumask_clear_cpu(cpu, tick_broadcast_mask);
152 tick_broadcast_clear_oneshot(cpu); 172 else
153 } else {
154 tick_device_setup_broadcast_func(dev); 173 tick_device_setup_broadcast_func(dev);
174
175 /*
176 * Clear the broadcast bit if the CPU is not in
177 * periodic broadcast on state.
178 */
179 if (!cpumask_test_cpu(cpu, tick_broadcast_on))
180 cpumask_clear_cpu(cpu, tick_broadcast_mask);
181
182 switch (tick_broadcast_device.mode) {
183 case TICKDEV_MODE_ONESHOT:
184 /*
185 * If the system is in oneshot mode we can
186 * unconditionally clear the oneshot mask bit,
187 * because the CPU is running and therefore
188 * not in an idle state which causes the power
189 * state affected device to stop. Let the
190 * caller initialize the device.
191 */
192 tick_broadcast_clear_oneshot(cpu);
193 ret = 0;
194 break;
195
196 case TICKDEV_MODE_PERIODIC:
197 /*
198 * If the system is in periodic mode, check
199 * whether the broadcast device can be
200 * switched off now.
201 */
202 if (cpumask_empty(tick_broadcast_mask) && bc)
203 clockevents_shutdown(bc);
204 /*
205 * If we kept the cpu in the broadcast mask,
206 * tell the caller to leave the per cpu device
207 * in shutdown state. The periodic interrupt
208 * is delivered by the broadcast device.
209 */
210 ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
211 break;
212 default:
213 /* Nothing to do */
214 ret = 0;
215 break;
155 } 216 }
156 } 217 }
157 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 218 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -281,6 +342,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
281 switch (*reason) { 342 switch (*reason) {
282 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 343 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
283 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 344 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
345 cpumask_set_cpu(cpu, tick_broadcast_on);
284 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { 346 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
285 if (tick_broadcast_device.mode == 347 if (tick_broadcast_device.mode ==
286 TICKDEV_MODE_PERIODIC) 348 TICKDEV_MODE_PERIODIC)
@@ -290,8 +352,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
290 tick_broadcast_force = 1; 352 tick_broadcast_force = 1;
291 break; 353 break;
292 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 354 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
293 if (!tick_broadcast_force && 355 if (tick_broadcast_force)
294 cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { 356 break;
357 cpumask_clear_cpu(cpu, tick_broadcast_on);
358 if (!tick_device_is_functional(dev))
359 break;
360 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
295 if (tick_broadcast_device.mode == 361 if (tick_broadcast_device.mode ==
296 TICKDEV_MODE_PERIODIC) 362 TICKDEV_MODE_PERIODIC)
297 tick_setup_periodic(dev, 0); 363 tick_setup_periodic(dev, 0);
@@ -349,6 +415,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
349 415
350 bc = tick_broadcast_device.evtdev; 416 bc = tick_broadcast_device.evtdev;
351 cpumask_clear_cpu(cpu, tick_broadcast_mask); 417 cpumask_clear_cpu(cpu, tick_broadcast_mask);
418 cpumask_clear_cpu(cpu, tick_broadcast_on);
352 419
353 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 420 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
354 if (bc && cpumask_empty(tick_broadcast_mask)) 421 if (bc && cpumask_empty(tick_broadcast_mask))
@@ -475,7 +542,15 @@ void tick_check_oneshot_broadcast(int cpu)
475 if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { 542 if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {
476 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 543 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
477 544
478 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); 545 /*
546 * We might be in the middle of switching over from
547 * periodic to oneshot. If the CPU has not yet
548 * switched over, leave the device alone.
549 */
550 if (td->mode == TICKDEV_MODE_ONESHOT) {
551 clockevents_set_mode(td->evtdev,
552 CLOCK_EVT_MODE_ONESHOT);
553 }
479 } 554 }
480} 555}
481 556
@@ -522,6 +597,13 @@ again:
522 cpumask_clear(tick_broadcast_force_mask); 597 cpumask_clear(tick_broadcast_force_mask);
523 598
524 /* 599 /*
600 * Sanity check. Catch the case where we try to broadcast to
601 * offline cpus.
602 */
603 if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask)))
604 cpumask_and(tmpmask, tmpmask, cpu_online_mask);
605
606 /*
525 * Wakeup the cpus which have an expired event. 607 * Wakeup the cpus which have an expired event.
526 */ 608 */
527 tick_do_broadcast(tmpmask); 609 tick_do_broadcast(tmpmask);
@@ -761,10 +843,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
761 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 843 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
762 844
763 /* 845 /*
764 * Clear the broadcast mask flag for the dead cpu, but do not 846 * Clear the broadcast masks for the dead cpu, but do not stop
765 * stop the broadcast device! 847 * the broadcast device!
766 */ 848 */
767 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); 849 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
850 cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
851 cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
768 852
769 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 853 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
770} 854}
@@ -792,6 +876,7 @@ bool tick_broadcast_oneshot_available(void)
792void __init tick_broadcast_init(void) 876void __init tick_broadcast_init(void)
793{ 877{
794 zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); 878 zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
879 zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT);
795 zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); 880 zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);
796#ifdef CONFIG_TICK_ONESHOT 881#ifdef CONFIG_TICK_ONESHOT
797 zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); 882 zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 5d3fb100bc06..64522ecdfe0e 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,6 +18,7 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/module.h>
21 22
22#include <asm/irq_regs.h> 23#include <asm/irq_regs.h>
23 24
@@ -33,7 +34,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
33ktime_t tick_next_period; 34ktime_t tick_next_period;
34ktime_t tick_period; 35ktime_t tick_period;
35int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
36static DEFINE_RAW_SPINLOCK(tick_device_lock);
37 37
38/* 38/*
39 * Debugging: see timer_list.c 39 * Debugging: see timer_list.c
@@ -194,7 +194,8 @@ static void tick_setup_device(struct tick_device *td,
194 * When global broadcasting is active, check if the current 194 * When global broadcasting is active, check if the current
195 * device is registered as a placeholder for broadcast mode. 195 * device is registered as a placeholder for broadcast mode.
196 * This allows us to handle this x86 misfeature in a generic 196 * This allows us to handle this x86 misfeature in a generic
197 * way. 197 * way. This function also returns !=0 when we keep the
198 * current active broadcast state for this CPU.
198 */ 199 */
199 if (tick_device_uses_broadcast(newdev, cpu)) 200 if (tick_device_uses_broadcast(newdev, cpu))
200 return; 201 return;
@@ -205,17 +206,75 @@ static void tick_setup_device(struct tick_device *td,
205 tick_setup_oneshot(newdev, handler, next_event); 206 tick_setup_oneshot(newdev, handler, next_event);
206} 207}
207 208
209void tick_install_replacement(struct clock_event_device *newdev)
210{
211 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
212 int cpu = smp_processor_id();
213
214 clockevents_exchange_device(td->evtdev, newdev);
215 tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
216 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
217 tick_oneshot_notify();
218}
219
220static bool tick_check_percpu(struct clock_event_device *curdev,
221 struct clock_event_device *newdev, int cpu)
222{
223 if (!cpumask_test_cpu(cpu, newdev->cpumask))
224 return false;
225 if (cpumask_equal(newdev->cpumask, cpumask_of(cpu)))
226 return true;
227 /* Check if irq affinity can be set */
228 if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq))
229 return false;
230 /* Prefer an existing cpu local device */
231 if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
232 return false;
233 return true;
234}
235
236static bool tick_check_preferred(struct clock_event_device *curdev,
237 struct clock_event_device *newdev)
238{
239 /* Prefer oneshot capable device */
240 if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) {
241 if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT))
242 return false;
243 if (tick_oneshot_mode_active())
244 return false;
245 }
246
247 /*
248 * Use the higher rated one, but prefer a CPU local device with a lower
249 * rating than a non-CPU local device
250 */
251 return !curdev ||
252 newdev->rating > curdev->rating ||
253 !cpumask_equal(curdev->cpumask, newdev->cpumask);
254}
255
256/*
257 * Check whether the new device is a better fit than curdev. curdev
258 * can be NULL !
259 */
260bool tick_check_replacement(struct clock_event_device *curdev,
261 struct clock_event_device *newdev)
262{
263 if (tick_check_percpu(curdev, newdev, smp_processor_id()))
264 return false;
265
266 return tick_check_preferred(curdev, newdev);
267}
268
208/* 269/*
209 * Check, if the new registered device should be used. 270 * Check, if the new registered device should be used. Called with
271 * clockevents_lock held and interrupts disabled.
210 */ 272 */
211static int tick_check_new_device(struct clock_event_device *newdev) 273void tick_check_new_device(struct clock_event_device *newdev)
212{ 274{
213 struct clock_event_device *curdev; 275 struct clock_event_device *curdev;
214 struct tick_device *td; 276 struct tick_device *td;
215 int cpu, ret = NOTIFY_OK; 277 int cpu;
216 unsigned long flags;
217
218 raw_spin_lock_irqsave(&tick_device_lock, flags);
219 278
220 cpu = smp_processor_id(); 279 cpu = smp_processor_id();
221 if (!cpumask_test_cpu(cpu, newdev->cpumask)) 280 if (!cpumask_test_cpu(cpu, newdev->cpumask))
@@ -225,40 +284,15 @@ static int tick_check_new_device(struct clock_event_device *newdev)
225 curdev = td->evtdev; 284 curdev = td->evtdev;
226 285
227 /* cpu local device ? */ 286 /* cpu local device ? */
228 if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { 287 if (!tick_check_percpu(curdev, newdev, cpu))
229 288 goto out_bc;
230 /*
231 * If the cpu affinity of the device interrupt can not
232 * be set, ignore it.
233 */
234 if (!irq_can_set_affinity(newdev->irq))
235 goto out_bc;
236 289
237 /* 290 /* Preference decision */
238 * If we have a cpu local device already, do not replace it 291 if (!tick_check_preferred(curdev, newdev))
239 * by a non cpu local device 292 goto out_bc;
240 */
241 if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
242 goto out_bc;
243 }
244 293
245 /* 294 if (!try_module_get(newdev->owner))
246 * If we have an active device, then check the rating and the oneshot 295 return;
247 * feature.
248 */
249 if (curdev) {
250 /*
251 * Prefer one shot capable devices !
252 */
253 if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
254 !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
255 goto out_bc;
256 /*
257 * Check the rating
258 */
259 if (curdev->rating >= newdev->rating)
260 goto out_bc;
261 }
262 296
263 /* 297 /*
264 * Replace the possibly existing device by the new 298 * Replace the possibly existing device by the new
@@ -273,20 +307,13 @@ static int tick_check_new_device(struct clock_event_device *newdev)
273 tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); 307 tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
274 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 308 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
275 tick_oneshot_notify(); 309 tick_oneshot_notify();
276 310 return;
277 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
278 return NOTIFY_STOP;
279 311
280out_bc: 312out_bc:
281 /* 313 /*
282 * Can the new device be used as a broadcast device ? 314 * Can the new device be used as a broadcast device ?
283 */ 315 */
284 if (tick_check_broadcast_device(newdev)) 316 tick_install_broadcast_device(newdev);
285 ret = NOTIFY_STOP;
286
287 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
288
289 return ret;
290} 317}
291 318
292/* 319/*
@@ -294,7 +321,7 @@ out_bc:
294 * 321 *
295 * Called with interrupts disabled. 322 * Called with interrupts disabled.
296 */ 323 */
297static void tick_handover_do_timer(int *cpup) 324void tick_handover_do_timer(int *cpup)
298{ 325{
299 if (*cpup == tick_do_timer_cpu) { 326 if (*cpup == tick_do_timer_cpu) {
300 int cpu = cpumask_first(cpu_online_mask); 327 int cpu = cpumask_first(cpu_online_mask);
@@ -311,13 +338,11 @@ static void tick_handover_do_timer(int *cpup)
311 * access the hardware device itself. 338 * access the hardware device itself.
312 * We just set the mode and remove it from the lists. 339 * We just set the mode and remove it from the lists.
313 */ 340 */
314static void tick_shutdown(unsigned int *cpup) 341void tick_shutdown(unsigned int *cpup)
315{ 342{
316 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); 343 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
317 struct clock_event_device *dev = td->evtdev; 344 struct clock_event_device *dev = td->evtdev;
318 unsigned long flags;
319 345
320 raw_spin_lock_irqsave(&tick_device_lock, flags);
321 td->mode = TICKDEV_MODE_PERIODIC; 346 td->mode = TICKDEV_MODE_PERIODIC;
322 if (dev) { 347 if (dev) {
323 /* 348 /*
@@ -329,26 +354,20 @@ static void tick_shutdown(unsigned int *cpup)
329 dev->event_handler = clockevents_handle_noop; 354 dev->event_handler = clockevents_handle_noop;
330 td->evtdev = NULL; 355 td->evtdev = NULL;
331 } 356 }
332 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
333} 357}
334 358
335static void tick_suspend(void) 359void tick_suspend(void)
336{ 360{
337 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 361 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
338 unsigned long flags;
339 362
340 raw_spin_lock_irqsave(&tick_device_lock, flags);
341 clockevents_shutdown(td->evtdev); 363 clockevents_shutdown(td->evtdev);
342 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
343} 364}
344 365
345static void tick_resume(void) 366void tick_resume(void)
346{ 367{
347 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 368 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
348 unsigned long flags;
349 int broadcast = tick_resume_broadcast(); 369 int broadcast = tick_resume_broadcast();
350 370
351 raw_spin_lock_irqsave(&tick_device_lock, flags);
352 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); 371 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
353 372
354 if (!broadcast) { 373 if (!broadcast) {
@@ -357,68 +376,12 @@ static void tick_resume(void)
357 else 376 else
358 tick_resume_oneshot(); 377 tick_resume_oneshot();
359 } 378 }
360 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
361} 379}
362 380
363/*
364 * Notification about clock event devices
365 */
366static int tick_notify(struct notifier_block *nb, unsigned long reason,
367 void *dev)
368{
369 switch (reason) {
370
371 case CLOCK_EVT_NOTIFY_ADD:
372 return tick_check_new_device(dev);
373
374 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
375 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
376 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
377 tick_broadcast_on_off(reason, dev);
378 break;
379
380 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
381 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
382 tick_broadcast_oneshot_control(reason);
383 break;
384
385 case CLOCK_EVT_NOTIFY_CPU_DYING:
386 tick_handover_do_timer(dev);
387 break;
388
389 case CLOCK_EVT_NOTIFY_CPU_DEAD:
390 tick_shutdown_broadcast_oneshot(dev);
391 tick_shutdown_broadcast(dev);
392 tick_shutdown(dev);
393 break;
394
395 case CLOCK_EVT_NOTIFY_SUSPEND:
396 tick_suspend();
397 tick_suspend_broadcast();
398 break;
399
400 case CLOCK_EVT_NOTIFY_RESUME:
401 tick_resume();
402 break;
403
404 default:
405 break;
406 }
407
408 return NOTIFY_OK;
409}
410
411static struct notifier_block tick_notifier = {
412 .notifier_call = tick_notify,
413};
414
415/** 381/**
416 * tick_init - initialize the tick control 382 * tick_init - initialize the tick control
417 *
418 * Register the notifier with the clockevents framework
419 */ 383 */
420void __init tick_init(void) 384void __init tick_init(void)
421{ 385{
422 clockevents_register_notifier(&tick_notifier);
423 tick_broadcast_init(); 386 tick_broadcast_init();
424} 387}
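
The per-cpu tick device selection now reduces to two small predicates: tick_check_percpu() decides whether the new device is usable on this CPU at all, and tick_check_preferred() decides whether it should replace the current one (oneshot capability first, then rating, with a CPU-local device allowed to win against a non-local one). A simplified standalone mock of the preference rule; the struct, the cpu_local flag and the sample ratings are illustrative stand-ins, not the real clock_event_device layout.

#include <stdbool.h>
#include <stdio.h>

#define FEAT_ONESHOT 0x01

struct mock_evtdev {
        const char *name;
        int rating;
        unsigned int features;
        bool cpu_local; /* stand-in for cpumask_equal(cpumask, cpumask_of(cpu)) */
};

/* mirrors the shape of tick_check_preferred(), not its exact cpumask test */
static bool prefer_new(const struct mock_evtdev *cur,
                       const struct mock_evtdev *newdev)
{
        /* a non-oneshot device never replaces an oneshot capable one */
        if (!(newdev->features & FEAT_ONESHOT) &&
            cur && (cur->features & FEAT_ONESHOT))
                return false;
        /* higher rating wins; a CPU-local newdev may beat a non-local cur */
        return !cur || newdev->rating > cur->rating ||
               (newdev->cpu_local && !cur->cpu_local);
}

int main(void)
{
        struct mock_evtdev hpet  = { .name = "hpet",  .rating = 50,
                                     .features = FEAT_ONESHOT, .cpu_local = false };
        struct mock_evtdev lapic = { .name = "lapic", .rating = 100,
                                     .features = FEAT_ONESHOT, .cpu_local = true };
        struct mock_evtdev pit   = { .name = "pit",   .rating = 110,
                                     .features = 0,            .cpu_local = false };

        printf("hpet  -> lapic: %d\n", prefer_new(&hpet, &lapic)); /* 1: higher rating */
        printf("lapic -> pit:   %d\n", prefer_new(&lapic, &pit));  /* 0: pit lacks oneshot */
        return 0;
}
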
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f0299eae4602..bc906cad709b 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -6,6 +6,8 @@
6 6
7extern seqlock_t jiffies_lock; 7extern seqlock_t jiffies_lock;
8 8
9#define CS_NAME_LEN 32
10
9#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD 11#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
10 12
11#define TICK_DO_TIMER_NONE -1 13#define TICK_DO_TIMER_NONE -1
@@ -18,9 +20,19 @@ extern int tick_do_timer_cpu __read_mostly;
18 20
19extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); 21extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
20extern void tick_handle_periodic(struct clock_event_device *dev); 22extern void tick_handle_periodic(struct clock_event_device *dev);
23extern void tick_check_new_device(struct clock_event_device *dev);
24extern void tick_handover_do_timer(int *cpup);
25extern void tick_shutdown(unsigned int *cpup);
26extern void tick_suspend(void);
27extern void tick_resume(void);
28extern bool tick_check_replacement(struct clock_event_device *curdev,
29 struct clock_event_device *newdev);
30extern void tick_install_replacement(struct clock_event_device *dev);
21 31
22extern void clockevents_shutdown(struct clock_event_device *dev); 32extern void clockevents_shutdown(struct clock_event_device *dev);
23 33
34extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
35
24/* 36/*
25 * NO_HZ / high resolution timer shared code 37 * NO_HZ / high resolution timer shared code
26 */ 38 */
@@ -90,7 +102,7 @@ static inline bool tick_broadcast_oneshot_available(void) { return false; }
90 */ 102 */
91#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 103#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
92extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); 104extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
93extern int tick_check_broadcast_device(struct clock_event_device *dev); 105extern void tick_install_broadcast_device(struct clock_event_device *dev);
94extern int tick_is_broadcast_device(struct clock_event_device *dev); 106extern int tick_is_broadcast_device(struct clock_event_device *dev);
95extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); 107extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
96extern void tick_shutdown_broadcast(unsigned int *cpup); 108extern void tick_shutdown_broadcast(unsigned int *cpup);
@@ -102,9 +114,8 @@ tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
102 114
103#else /* !BROADCAST */ 115#else /* !BROADCAST */
104 116
105static inline int tick_check_broadcast_device(struct clock_event_device *dev) 117static inline void tick_install_broadcast_device(struct clock_event_device *dev)
106{ 118{
107 return 0;
108} 119}
109 120
110static inline int tick_is_broadcast_device(struct clock_event_device *dev) 121static inline int tick_is_broadcast_device(struct clock_event_device *dev)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 0cf1c1453181..e77edc97e036 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -178,6 +178,11 @@ static bool can_stop_full_tick(void)
178 */ 178 */
179 if (!sched_clock_stable) { 179 if (!sched_clock_stable) {
180 trace_tick_stop(0, "unstable sched clock\n"); 180 trace_tick_stop(0, "unstable sched clock\n");
181 /*
182 * Don't allow the user to think they can get
183 * full NO_HZ with this machine.
184 */
185 WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock");
181 return false; 186 return false;
182 } 187 }
183#endif 188#endif
@@ -293,7 +298,7 @@ static int __init tick_nohz_full_setup(char *str)
293} 298}
294__setup("nohz_full=", tick_nohz_full_setup); 299__setup("nohz_full=", tick_nohz_full_setup);
295 300
296static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, 301static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
297 unsigned long action, 302 unsigned long action,
298 void *hcpu) 303 void *hcpu)
299{ 304{
@@ -346,16 +351,6 @@ void __init tick_nohz_init(void)
346 } 351 }
347 352
348 cpu_notifier(tick_nohz_cpu_down_callback, 0); 353 cpu_notifier(tick_nohz_cpu_down_callback, 0);
349
350 /* Make sure full dynticks CPU are also RCU nocbs */
351 for_each_cpu(cpu, nohz_full_mask) {
352 if (!rcu_is_nocb_cpu(cpu)) {
353 pr_warning("NO_HZ: CPU %d is not RCU nocb: "
354 "cleared from nohz_full range", cpu);
355 cpumask_clear_cpu(cpu, nohz_full_mask);
356 }
357 }
358
359 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); 354 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask);
360 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); 355 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
361} 356}
@@ -832,13 +827,10 @@ void tick_nohz_irq_exit(void)
832{ 827{
833 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 828 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
834 829
835 if (ts->inidle) { 830 if (ts->inidle)
836 /* Cancel the timer because CPU already waken up from the C-states*/
837 menu_hrtimer_cancel();
838 __tick_nohz_idle_enter(ts); 831 __tick_nohz_idle_enter(ts);
839 } else { 832 else
840 tick_nohz_full_stop_tick(ts); 833 tick_nohz_full_stop_tick(ts);
841 }
842} 834}
843 835
844/** 836/**
@@ -936,8 +928,6 @@ void tick_nohz_idle_exit(void)
936 928
937 ts->inidle = 0; 929 ts->inidle = 0;
938 930
939 /* Cancel the timer because CPU already waken up from the C-states*/
940 menu_hrtimer_cancel();
941 if (ts->idle_active || ts->tick_stopped) 931 if (ts->idle_active || ts->tick_stopped)
942 now = ktime_get(); 932 now = ktime_get();
943 933
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index baeeb5c87cf1..48b9fffabdc2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -25,6 +25,11 @@
25 25
26#include "tick-internal.h" 26#include "tick-internal.h"
27#include "ntp_internal.h" 27#include "ntp_internal.h"
28#include "timekeeping_internal.h"
29
30#define TK_CLEAR_NTP (1 << 0)
31#define TK_MIRROR (1 << 1)
32#define TK_CLOCK_WAS_SET (1 << 2)
28 33
29static struct timekeeper timekeeper; 34static struct timekeeper timekeeper;
30static DEFINE_RAW_SPINLOCK(timekeeper_lock); 35static DEFINE_RAW_SPINLOCK(timekeeper_lock);
@@ -200,9 +205,9 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
200 205
201static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); 206static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
202 207
203static void update_pvclock_gtod(struct timekeeper *tk) 208static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
204{ 209{
205 raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); 210 raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
206} 211}
207 212
208/** 213/**
@@ -216,7 +221,7 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb)
216 221
217 raw_spin_lock_irqsave(&timekeeper_lock, flags); 222 raw_spin_lock_irqsave(&timekeeper_lock, flags);
218 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); 223 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
219 update_pvclock_gtod(tk); 224 update_pvclock_gtod(tk, true);
220 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 225 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
221 226
222 return ret; 227 return ret;
@@ -241,16 +246,16 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
241EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); 246EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
242 247
243/* must hold timekeeper_lock */ 248/* must hold timekeeper_lock */
244static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) 249static void timekeeping_update(struct timekeeper *tk, unsigned int action)
245{ 250{
246 if (clearntp) { 251 if (action & TK_CLEAR_NTP) {
247 tk->ntp_error = 0; 252 tk->ntp_error = 0;
248 ntp_clear(); 253 ntp_clear();
249 } 254 }
250 update_vsyscall(tk); 255 update_vsyscall(tk);
251 update_pvclock_gtod(tk); 256 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
252 257
253 if (mirror) 258 if (action & TK_MIRROR)
254 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); 259 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
255} 260}
256 261
@@ -508,7 +513,7 @@ int do_settimeofday(const struct timespec *tv)
508 513
509 tk_set_xtime(tk, tv); 514 tk_set_xtime(tk, tv);
510 515
511 timekeeping_update(tk, true, true); 516 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
512 517
513 write_seqcount_end(&timekeeper_seq); 518 write_seqcount_end(&timekeeper_seq);
514 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 519 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -552,7 +557,7 @@ int timekeeping_inject_offset(struct timespec *ts)
552 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); 557 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
553 558
554error: /* even if we error out, we forwarded the time, so call update */ 559error: /* even if we error out, we forwarded the time, so call update */
555 timekeeping_update(tk, true, true); 560 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
556 561
557 write_seqcount_end(&timekeeper_seq); 562 write_seqcount_end(&timekeeper_seq);
558 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 563 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -627,13 +632,22 @@ static int change_clocksource(void *data)
627 write_seqcount_begin(&timekeeper_seq); 632 write_seqcount_begin(&timekeeper_seq);
628 633
629 timekeeping_forward_now(tk); 634 timekeeping_forward_now(tk);
630 if (!new->enable || new->enable(new) == 0) { 635 /*
631 old = tk->clock; 636 * If the cs is in module, get a module reference. Succeeds
632 tk_setup_internals(tk, new); 637 * for built-in code (owner == NULL) as well.
633 if (old->disable) 638 */
634 old->disable(old); 639 if (try_module_get(new->owner)) {
640 if (!new->enable || new->enable(new) == 0) {
641 old = tk->clock;
642 tk_setup_internals(tk, new);
643 if (old->disable)
644 old->disable(old);
645 module_put(old->owner);
646 } else {
647 module_put(new->owner);
648 }
635 } 649 }
636 timekeeping_update(tk, true, true); 650 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
637 651
638 write_seqcount_end(&timekeeper_seq); 652 write_seqcount_end(&timekeeper_seq);
639 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 653 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -648,14 +662,15 @@ static int change_clocksource(void *data)
648 * This function is called from clocksource.c after a new, better clock 662 * This function is called from clocksource.c after a new, better clock
649 * source has been registered. The caller holds the clocksource_mutex. 663 * source has been registered. The caller holds the clocksource_mutex.
650 */ 664 */
651void timekeeping_notify(struct clocksource *clock) 665int timekeeping_notify(struct clocksource *clock)
652{ 666{
653 struct timekeeper *tk = &timekeeper; 667 struct timekeeper *tk = &timekeeper;
654 668
655 if (tk->clock == clock) 669 if (tk->clock == clock)
656 return; 670 return 0;
657 stop_machine(change_clocksource, clock, NULL); 671 stop_machine(change_clocksource, clock, NULL);
658 tick_clock_notify(); 672 tick_clock_notify();
673 return tk->clock == clock ? 0 : -1;
659} 674}
660 675
661/** 676/**
@@ -841,6 +856,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
841 tk_xtime_add(tk, delta); 856 tk_xtime_add(tk, delta);
842 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); 857 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta));
843 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); 858 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta));
859 tk_debug_account_sleep_time(delta);
844} 860}
845 861
846/** 862/**
@@ -872,7 +888,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
872 888
873 __timekeeping_inject_sleeptime(tk, delta); 889 __timekeeping_inject_sleeptime(tk, delta);
874 890
875 timekeeping_update(tk, true, true); 891 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
876 892
877 write_seqcount_end(&timekeeper_seq); 893 write_seqcount_end(&timekeeper_seq);
878 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 894 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -954,7 +970,7 @@ static void timekeeping_resume(void)
954 tk->cycle_last = clock->cycle_last = cycle_now; 970 tk->cycle_last = clock->cycle_last = cycle_now;
955 tk->ntp_error = 0; 971 tk->ntp_error = 0;
956 timekeeping_suspended = 0; 972 timekeeping_suspended = 0;
957 timekeeping_update(tk, false, true); 973 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
958 write_seqcount_end(&timekeeper_seq); 974 write_seqcount_end(&timekeeper_seq);
959 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 975 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
960 976
@@ -1236,9 +1252,10 @@ out_adjust:
1236 * It also calls into the NTP code to handle leapsecond processing. 1252 * It also calls into the NTP code to handle leapsecond processing.
1237 * 1253 *
1238 */ 1254 */
1239static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) 1255static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1240{ 1256{
1241 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; 1257 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
1258 unsigned int action = 0;
1242 1259
1243 while (tk->xtime_nsec >= nsecps) { 1260 while (tk->xtime_nsec >= nsecps) {
1244 int leap; 1261 int leap;
@@ -1261,8 +1278,10 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1261 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); 1278 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
1262 1279
1263 clock_was_set_delayed(); 1280 clock_was_set_delayed();
1281 action = TK_CLOCK_WAS_SET;
1264 } 1282 }
1265 } 1283 }
1284 return action;
1266} 1285}
1267 1286
1268/** 1287/**
@@ -1347,6 +1366,7 @@ static void update_wall_time(void)
1347 struct timekeeper *tk = &shadow_timekeeper; 1366 struct timekeeper *tk = &shadow_timekeeper;
1348 cycle_t offset; 1367 cycle_t offset;
1349 int shift = 0, maxshift; 1368 int shift = 0, maxshift;
1369 unsigned int action;
1350 unsigned long flags; 1370 unsigned long flags;
1351 1371
1352 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1372 raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -1399,7 +1419,7 @@ static void update_wall_time(void)
1399 * Finally, make sure that after the rounding 1419 * Finally, make sure that after the rounding
1400 * xtime_nsec isn't larger than NSEC_PER_SEC 1420 * xtime_nsec isn't larger than NSEC_PER_SEC
1401 */ 1421 */
1402 accumulate_nsecs_to_secs(tk); 1422 action = accumulate_nsecs_to_secs(tk);
1403 1423
1404 write_seqcount_begin(&timekeeper_seq); 1424 write_seqcount_begin(&timekeeper_seq);
1405 /* Update clock->cycle_last with the new value */ 1425 /* Update clock->cycle_last with the new value */
@@ -1415,7 +1435,7 @@ static void update_wall_time(void)
1415 * updating. 1435 * updating.
1416 */ 1436 */
1417 memcpy(real_tk, tk, sizeof(*tk)); 1437 memcpy(real_tk, tk, sizeof(*tk));
1418 timekeeping_update(real_tk, false, false); 1438 timekeeping_update(real_tk, action);
1419 write_seqcount_end(&timekeeper_seq); 1439 write_seqcount_end(&timekeeper_seq);
1420out: 1440out:
1421 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1441 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -1677,6 +1697,7 @@ int do_adjtimex(struct timex *txc)
1677 1697
1678 if (tai != orig_tai) { 1698 if (tai != orig_tai) {
1679 __timekeeping_set_tai_offset(tk, tai); 1699 __timekeeping_set_tai_offset(tk, tai);
1700 update_pvclock_gtod(tk, true);
1680 clock_was_set_delayed(); 1701 clock_was_set_delayed();
1681 } 1702 }
1682 write_seqcount_end(&timekeeper_seq); 1703 write_seqcount_end(&timekeeper_seq);
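
timekeeping_update() previously took two booleans (clearntp, mirror); the patch folds them, plus the new "clock was set" notification, into a single action bitmask so that accumulate_nsecs_to_secs() can report a leap-second step back up to update_wall_time(). A trivial standalone sketch of how the bits compose; the demo harness is an assumption, the flag values mirror the #defines above.

#include <stdio.h>

#define TK_CLEAR_NTP            (1 << 0)
#define TK_MIRROR               (1 << 1)
#define TK_CLOCK_WAS_SET        (1 << 2)

static void fake_timekeeping_update(unsigned int action)
{
        if (action & TK_CLEAR_NTP)
                printf("  clear NTP state\n");
        if (action & TK_CLOCK_WAS_SET)
                printf("  pass 'clock was set' to the pvclock_gtod notifier chain\n");
        if (action & TK_MIRROR)
                printf("  mirror into shadow_timekeeper\n");
}

int main(void)
{
        printf("do_settimeofday() path:\n");
        fake_timekeeping_update(TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

        printf("update_wall_time(), no leap second this tick:\n");
        fake_timekeeping_update(0);

        printf("update_wall_time(), leap second accumulated:\n");
        fake_timekeeping_update(TK_CLOCK_WAS_SET);
        return 0;
}
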
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
new file mode 100644
index 000000000000..802433a4f5eb
--- /dev/null
+++ b/kernel/time/timekeeping_debug.c
@@ -0,0 +1,72 @@
1/*
2 * debugfs file to track time spent in suspend
3 *
4 * Copyright (c) 2011, Google, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 */
16
17#include <linux/debugfs.h>
18#include <linux/err.h>
19#include <linux/init.h>
20#include <linux/kernel.h>
21#include <linux/seq_file.h>
22#include <linux/time.h>
23
24static unsigned int sleep_time_bin[32] = {0};
25
26static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
27{
28 unsigned int bin;
29 seq_puts(s, " time (secs) count\n");
30 seq_puts(s, "------------------------------\n");
31 for (bin = 0; bin < 32; bin++) {
32 if (sleep_time_bin[bin] == 0)
33 continue;
34 seq_printf(s, "%10u - %-10u %4u\n",
35 bin ? 1 << (bin - 1) : 0, 1 << bin,
36 sleep_time_bin[bin]);
37 }
38 return 0;
39}
40
41static int tk_debug_sleep_time_open(struct inode *inode, struct file *file)
42{
43 return single_open(file, tk_debug_show_sleep_time, NULL);
44}
45
46static const struct file_operations tk_debug_sleep_time_fops = {
47 .open = tk_debug_sleep_time_open,
48 .read = seq_read,
49 .llseek = seq_lseek,
50 .release = single_release,
51};
52
53static int __init tk_debug_sleep_time_init(void)
54{
55 struct dentry *d;
56
57 d = debugfs_create_file("sleep_time", 0444, NULL, NULL,
58 &tk_debug_sleep_time_fops);
59 if (!d) {
60 pr_err("Failed to create sleep_time debug file\n");
61 return -ENOMEM;
62 }
63
64 return 0;
65}
66late_initcall(tk_debug_sleep_time_init);
67
68void tk_debug_account_sleep_time(struct timespec *t)
69{
70 sleep_time_bin[fls(t->tv_sec)]++;
71}
72
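
tk_debug_account_sleep_time() buckets every suspend into a power-of-two histogram: a sleep of t seconds is counted in bin fls(t), i.e. the range [2^(bin-1), 2^bin) seconds, which is exactly what tk_debug_show_sleep_time() prints. A small userspace illustration of the binning; the local fls() is a stand-in for the kernel helper.

#include <stdio.h>

static int fls(unsigned int x)
{
        return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
        unsigned int secs[] = { 0, 1, 3, 75, 600 };

        for (unsigned int i = 0; i < sizeof(secs) / sizeof(secs[0]); i++) {
                int bin = fls(secs[i]);

                /* e.g. a 75 s suspend lands in bin 7, the 64 - 128 s bucket */
                printf("%4u s -> bin %2d (%u - %u s)\n", secs[i], bin,
                       bin ? 1u << (bin - 1) : 0, 1u << bin);
        }
        return 0;
}
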
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
new file mode 100644
index 000000000000..13323ea08ffa
--- /dev/null
+++ b/kernel/time/timekeeping_internal.h
@@ -0,0 +1,14 @@
1#ifndef _TIMEKEEPING_INTERNAL_H
2#define _TIMEKEEPING_INTERNAL_H
3/*
4 * timekeeping debug functions
5 */
6#include <linux/time.h>
7
8#ifdef CONFIG_DEBUG_FS
9extern void tk_debug_account_sleep_time(struct timespec *t);
10#else
11#define tk_debug_account_sleep_time(x)
12#endif
13
14#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/timer.c b/kernel/timer.c
index 15ffdb3f1948..4296d13db3d1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -149,9 +149,11 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
149 /* now that we have rounded, subtract the extra skew again */ 149 /* now that we have rounded, subtract the extra skew again */
150 j -= cpu * 3; 150 j -= cpu * 3;
151 151
152 if (j <= jiffies) /* rounding ate our timeout entirely; */ 152 /*
153 return original; 153 * Make sure j is still in the future. Otherwise return the
154 return j; 154 * unmodified value.
155 */
156 return time_is_after_jiffies(j) ? j : original;
155} 157}
156 158
157/** 159/**
@@ -1503,11 +1505,11 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1503} 1505}
1504EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1506EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1505 1507
1506static int __cpuinit init_timers_cpu(int cpu) 1508static int init_timers_cpu(int cpu)
1507{ 1509{
1508 int j; 1510 int j;
1509 struct tvec_base *base; 1511 struct tvec_base *base;
1510 static char __cpuinitdata tvec_base_done[NR_CPUS]; 1512 static char tvec_base_done[NR_CPUS];
1511 1513
1512 if (!tvec_base_done[cpu]) { 1514 if (!tvec_base_done[cpu]) {
1513 static char boot_done; 1515 static char boot_done;
@@ -1575,7 +1577,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1575 } 1577 }
1576} 1578}
1577 1579
1578static void __cpuinit migrate_timers(int cpu) 1580static void migrate_timers(int cpu)
1579{ 1581{
1580 struct tvec_base *old_base; 1582 struct tvec_base *old_base;
1581 struct tvec_base *new_base; 1583 struct tvec_base *new_base;
@@ -1608,7 +1610,7 @@ static void __cpuinit migrate_timers(int cpu)
1608} 1610}
1609#endif /* CONFIG_HOTPLUG_CPU */ 1611#endif /* CONFIG_HOTPLUG_CPU */
1610 1612
1611static int __cpuinit timer_cpu_notify(struct notifier_block *self, 1613static int timer_cpu_notify(struct notifier_block *self,
1612 unsigned long action, void *hcpu) 1614 unsigned long action, void *hcpu)
1613{ 1615{
1614 long cpu = (long)hcpu; 1616 long cpu = (long)hcpu;
@@ -1633,7 +1635,7 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1633 return NOTIFY_OK; 1635 return NOTIFY_OK;
1634} 1636}
1635 1637
1636static struct notifier_block __cpuinitdata timers_nb = { 1638static struct notifier_block timers_nb = {
1637 .notifier_call = timer_cpu_notify, 1639 .notifier_call = timer_cpu_notify,
1638}; 1640};
1639 1641
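
The round_jiffies_common() change replaces the unsigned "j <= jiffies" test with time_is_after_jiffies(), which uses the signed-difference idiom and therefore keeps giving the right answer when jiffies wraps around. A minimal illustration of the idiom with a simulated 32-bit counter; the macro below is a userspace stand-in for the kernel's time_after().

#include <stdint.h>
#include <stdio.h>

/* signed-difference comparison (32-bit stand-in for the kernel macro) */
#define time_after(a, b)        ((int32_t)((b) - (a)) < 0)

int main(void)
{
        uint32_t jiffies = 0xfffffff0u;         /* counter about to wrap */
        uint32_t rounded = jiffies + 0x20;      /* wraps to 0x10 */

        printf("naive 'rounded > jiffies':    %d\n", rounded > jiffies);            /* 0: wrong */
        printf("time_after(rounded, jiffies): %d\n", time_after(rounded, jiffies)); /* 1: right */
        return 0;
}
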
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6c508ff33c62..a6d098c6df3f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -413,6 +413,17 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
413 return 0; 413 return 0;
414} 414}
415 415
416static void ftrace_sync(struct work_struct *work)
417{
418 /*
419 * This function is just a stub to implement a hard force
420 * of synchronize_sched(). This requires synchronizing
421 * tasks even in userspace and idle.
422 *
423 * Yes, function tracing is rude.
424 */
425}
426
416static int __unregister_ftrace_function(struct ftrace_ops *ops) 427static int __unregister_ftrace_function(struct ftrace_ops *ops)
417{ 428{
418 int ret; 429 int ret;
@@ -440,8 +451,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
440 * so there'll be no new users. We must ensure 451 * so there'll be no new users. We must ensure
441 * all current users are done before we free 452 * all current users are done before we free
442 * the control data. 453 * the control data.
454 * Note synchronize_sched() is not enough, as we
455 * use preempt_disable() to do RCU, but the function
456 * tracer can be called where RCU is not active
457 * (before user_exit()).
443 */ 458 */
444 synchronize_sched(); 459 schedule_on_each_cpu(ftrace_sync);
445 control_ops_free(ops); 460 control_ops_free(ops);
446 } 461 }
447 } else 462 } else
@@ -456,9 +471,13 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
456 /* 471 /*
457 * Dynamic ops may be freed, we must make sure that all 472 * Dynamic ops may be freed, we must make sure that all
458 * callers are done before leaving this function. 473 * callers are done before leaving this function.
474 *
475 * Again, normal synchronize_sched() is not good enough.
476 * We need to do a hard force of sched synchronization.
459 */ 477 */
460 if (ops->flags & FTRACE_OPS_FL_DYNAMIC) 478 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
461 synchronize_sched(); 479 schedule_on_each_cpu(ftrace_sync);
480
462 481
463 return 0; 482 return 0;
464} 483}
@@ -622,12 +641,18 @@ static int function_stat_show(struct seq_file *m, void *v)
622 if (rec->counter <= 1) 641 if (rec->counter <= 1)
623 stddev = 0; 642 stddev = 0;
624 else { 643 else {
625 stddev = rec->time_squared - rec->counter * avg * avg; 644 /*
645 * Apply Welford's method:
646 * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
647 */
648 stddev = rec->counter * rec->time_squared -
649 rec->time * rec->time;
650
626 /* 651 /*
627 * Divide only 1000 for ns^2 -> us^2 conversion. 652 * Divide only 1000 for ns^2 -> us^2 conversion.
628 * trace_print_graph_duration will divide 1000 again. 653 * trace_print_graph_duration will divide 1000 again.
629 */ 654 */
630 do_div(stddev, (rec->counter - 1) * 1000); 655 do_div(stddev, rec->counter * (rec->counter - 1) * 1000);
631 } 656 }
632 657
633 trace_seq_init(&s); 658 trace_seq_init(&s);
@@ -1416,12 +1441,22 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1416 * the hashes are freed with call_rcu_sched(). 1441 * the hashes are freed with call_rcu_sched().
1417 */ 1442 */
1418static int 1443static int
1419ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) 1444ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
1420{ 1445{
1421 struct ftrace_hash *filter_hash; 1446 struct ftrace_hash *filter_hash;
1422 struct ftrace_hash *notrace_hash; 1447 struct ftrace_hash *notrace_hash;
1423 int ret; 1448 int ret;
1424 1449
1450#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
1451 /*
1452 * There's a small race when adding ops that the ftrace handler
1453 * that wants regs, may be called without them. We can not
1454 * allow that handler to be called if regs is NULL.
1455 */
1456 if (regs == NULL && (ops->flags & FTRACE_OPS_FL_SAVE_REGS))
1457 return 0;
1458#endif
1459
1425 filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); 1460 filter_hash = rcu_dereference_raw_notrace(ops->filter_hash);
1426 notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); 1461 notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash);
1427 1462
@@ -2134,12 +2169,57 @@ static cycle_t ftrace_update_time;
2134static unsigned long ftrace_update_cnt; 2169static unsigned long ftrace_update_cnt;
2135unsigned long ftrace_update_tot_cnt; 2170unsigned long ftrace_update_tot_cnt;
2136 2171
2137static int ops_traces_mod(struct ftrace_ops *ops) 2172static inline int ops_traces_mod(struct ftrace_ops *ops)
2138{ 2173{
2139 struct ftrace_hash *hash; 2174 /*
2175 * Filter_hash being empty will default to trace module.
2176 * But notrace hash requires a test of individual module functions.
2177 */
2178 return ftrace_hash_empty(ops->filter_hash) &&
2179 ftrace_hash_empty(ops->notrace_hash);
2180}
2181
2182/*
2183 * Check if the current ops references the record.
2184 *
2185 * If the ops traces all functions, then it was already accounted for.
2186 * If the ops does not trace the current record function, skip it.
2187 * If the ops ignores the function via notrace filter, skip it.
2188 */
2189static inline bool
2190ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
2191{
2192 /* If ops isn't enabled, ignore it */
2193 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
2194 return 0;
2140 2195
2141 hash = ops->filter_hash; 2196 /* If ops traces all mods, we already accounted for it */
2142 return ftrace_hash_empty(hash); 2197 if (ops_traces_mod(ops))
2198 return 0;
2199
2200 /* The function must be in the filter */
2201 if (!ftrace_hash_empty(ops->filter_hash) &&
2202 !ftrace_lookup_ip(ops->filter_hash, rec->ip))
2203 return 0;
2204
2205 /* If in notrace hash, we ignore it too */
2206 if (ftrace_lookup_ip(ops->notrace_hash, rec->ip))
2207 return 0;
2208
2209 return 1;
2210}
2211
2212static int referenced_filters(struct dyn_ftrace *rec)
2213{
2214 struct ftrace_ops *ops;
2215 int cnt = 0;
2216
2217 for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
2218 if (ops_references_rec(ops, rec))
2219 cnt++;
2220 }
2221
2222 return cnt;
2143} 2223}
2144 2224
2145static int ftrace_update_code(struct module *mod) 2225static int ftrace_update_code(struct module *mod)
@@ -2148,6 +2228,7 @@ static int ftrace_update_code(struct module *mod)
2148 struct dyn_ftrace *p; 2228 struct dyn_ftrace *p;
2149 cycle_t start, stop; 2229 cycle_t start, stop;
2150 unsigned long ref = 0; 2230 unsigned long ref = 0;
2231 bool test = false;
2151 int i; 2232 int i;
2152 2233
2153 /* 2234 /*
@@ -2161,9 +2242,12 @@ static int ftrace_update_code(struct module *mod)
2161 2242
2162 for (ops = ftrace_ops_list; 2243 for (ops = ftrace_ops_list;
2163 ops != &ftrace_list_end; ops = ops->next) { 2244 ops != &ftrace_list_end; ops = ops->next) {
2164 if (ops->flags & FTRACE_OPS_FL_ENABLED && 2245 if (ops->flags & FTRACE_OPS_FL_ENABLED) {
2165 ops_traces_mod(ops)) 2246 if (ops_traces_mod(ops))
2166 ref++; 2247 ref++;
2248 else
2249 test = true;
2250 }
2167 } 2251 }
2168 } 2252 }
2169 2253
@@ -2173,12 +2257,16 @@ static int ftrace_update_code(struct module *mod)
2173 for (pg = ftrace_new_pgs; pg; pg = pg->next) { 2257 for (pg = ftrace_new_pgs; pg; pg = pg->next) {
2174 2258
2175 for (i = 0; i < pg->index; i++) { 2259 for (i = 0; i < pg->index; i++) {
2260 int cnt = ref;
2261
2176 /* If something went wrong, bail without enabling anything */ 2262 /* If something went wrong, bail without enabling anything */
2177 if (unlikely(ftrace_disabled)) 2263 if (unlikely(ftrace_disabled))
2178 return -1; 2264 return -1;
2179 2265
2180 p = &pg->records[i]; 2266 p = &pg->records[i];
2181 p->flags = ref; 2267 if (test)
2268 cnt += referenced_filters(p);
2269 p->flags = cnt;
2182 2270
2183 /* 2271 /*
2184 * Do the initial record conversion from mcount jump 2272 * Do the initial record conversion from mcount jump
@@ -2198,7 +2286,7 @@ static int ftrace_update_code(struct module *mod)
2198 * conversion puts the module to the correct state, thus 2286 * conversion puts the module to the correct state, thus
2199 * passing the ftrace_make_call check. 2287 * passing the ftrace_make_call check.
2200 */ 2288 */
2201 if (ftrace_start_up && ref) { 2289 if (ftrace_start_up && cnt) {
2202 int failed = __ftrace_replace_code(p, 1); 2290 int failed = __ftrace_replace_code(p, 1);
2203 if (failed) 2291 if (failed)
2204 ftrace_bug(failed, p->ip); 2292 ftrace_bug(failed, p->ip);
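
The loop above decides the initial ->flags count for every new module record: enabled ops that trace all module functions contribute once to the shared ref base, while any enabled ops carrying a filter or notrace hash sets test and forces a per-record referenced_filters() walk. Below is a minimal user-space model of that counting, using toy arrays in place of struct ftrace_hash; only the logic is borrowed from the hunks above, all names and types here are illustrative.

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for ftrace_hash: an empty set means "no restriction". */
struct toy_hash { const unsigned long *ips; int nr; };

struct toy_ops {
	bool enabled;
	struct toy_hash filter;    /* empty => trace everything */
	struct toy_hash notrace;   /* empty => exclude nothing  */
	struct toy_ops *next;
};

static bool hash_empty(const struct toy_hash *h) { return h->nr == 0; }

static bool hash_lookup(const struct toy_hash *h, unsigned long ip)
{
	for (int i = 0; i < h->nr; i++)
		if (h->ips[i] == ip)
			return true;
	return false;
}

/* Mirrors ops_traces_mod(): no filter and no notrace => traces all module code. */
static bool traces_all_mods(const struct toy_ops *ops)
{
	return hash_empty(&ops->filter) && hash_empty(&ops->notrace);
}

/* Mirrors ops_references_rec(): does this ops care about this one address? */
static bool references_rec(const struct toy_ops *ops, unsigned long ip)
{
	if (!ops->enabled)
		return false;
	if (traces_all_mods(ops))       /* already counted in the shared ref */
		return false;
	if (!hash_empty(&ops->filter) && !hash_lookup(&ops->filter, ip))
		return false;
	if (hash_lookup(&ops->notrace, ip))
		return false;
	return true;
}

int main(void)
{
	const unsigned long wanted[] = { 0x1000 };
	struct toy_ops filtered  = { .enabled = true, .filter = { wanted, 1 } };
	struct toy_ops trace_all = { .enabled = true, .next = &filtered };
	struct toy_ops *list = &trace_all;

	/* First pass, as in ftrace_update_code(): shared base + "need per-record test?" */
	unsigned long ref = 0;
	bool test = false;
	for (struct toy_ops *ops = list; ops; ops = ops->next) {
		if (!ops->enabled)
			continue;
		if (traces_all_mods(ops))
			ref++;
		else
			test = true;
	}

	/* Second pass: each record's flags = base count + ops that reference it. */
	for (unsigned long ip = 0x1000; ip <= 0x2000; ip += 0x1000) {
		unsigned long cnt = ref;
		if (test)
			for (struct toy_ops *ops = list; ops; ops = ops->next)
				cnt += references_rec(ops, ip);
		printf("record %#lx -> flags = %lu\n", ip, cnt);
	}
	return 0;
}

With that count in place, a record is only patched in when at least one ops actually wants it, which is what the ftrace_start_up && cnt check above relies on.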
@@ -3349,6 +3437,12 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
3349 return add_hash_entry(hash, ip); 3437 return add_hash_entry(hash, ip);
3350} 3438}
3351 3439
3440static void ftrace_ops_update_code(struct ftrace_ops *ops)
3441{
3442 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled)
3443 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3444}
3445
3352static int 3446static int
3353ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, 3447ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3354 unsigned long ip, int remove, int reset, int enable) 3448 unsigned long ip, int remove, int reset, int enable)
@@ -3391,9 +3485,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3391 3485
3392 mutex_lock(&ftrace_lock); 3486 mutex_lock(&ftrace_lock);
3393 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3487 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
3394 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED 3488 if (!ret)
3395 && ftrace_enabled) 3489 ftrace_ops_update_code(ops);
3396 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3397 3490
3398 mutex_unlock(&ftrace_lock); 3491 mutex_unlock(&ftrace_lock);
3399 3492
@@ -3512,8 +3605,12 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
3512static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; 3605static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
3513static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; 3606static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
3514 3607
3608/* Used by the function selftest to skip testing when a boot-time filter is set */
3609bool ftrace_filter_param __initdata;
3610
3515static int __init set_ftrace_notrace(char *str) 3611static int __init set_ftrace_notrace(char *str)
3516{ 3612{
3613 ftrace_filter_param = true;
3517 strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); 3614 strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
3518 return 1; 3615 return 1;
3519} 3616}
@@ -3521,6 +3618,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace);
3521 3618
3522static int __init set_ftrace_filter(char *str) 3619static int __init set_ftrace_filter(char *str)
3523{ 3620{
3621 ftrace_filter_param = true;
3524 strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); 3622 strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
3525 return 1; 3623 return 1;
3526} 3624}
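
ftrace_filter_param only records that one of the two filter parameters appeared on the kernel command line, so the boot-time function selftest can skip tests that such a filter would break. The same __setup() shape, written with hypothetical my_* names rather than code from this patch, looks like this:

#include <linux/init.h>
#include <linux/string.h>
#include <linux/types.h>

#define MY_FILTER_SIZE 256

static char my_filter_buf[MY_FILTER_SIZE] __initdata;  /* only needed during boot */
bool my_filter_param __initdata;                        /* checked by an __init selftest */

static int __init set_my_filter(char *str)
{
	my_filter_param = true;                        /* remember that a filter was given */
	strlcpy(my_filter_buf, str, MY_FILTER_SIZE);   /* keep the value for later parsing */
	return 1;                                      /* non-zero: parameter consumed */
}
__setup("my_filter=", set_my_filter);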
@@ -3615,9 +3713,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
3615 mutex_lock(&ftrace_lock); 3713 mutex_lock(&ftrace_lock);
3616 ret = ftrace_hash_move(iter->ops, filter_hash, 3714 ret = ftrace_hash_move(iter->ops, filter_hash,
3617 orig_hash, iter->hash); 3715 orig_hash, iter->hash);
3618 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) 3716 if (!ret)
3619 && ftrace_enabled) 3717 ftrace_ops_update_code(iter->ops);
3620 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3621 3718
3622 mutex_unlock(&ftrace_lock); 3719 mutex_unlock(&ftrace_lock);
3623 } 3720 }
@@ -4188,7 +4285,7 @@ static inline void ftrace_startup_enable(int command) { }
4188# define ftrace_shutdown_sysctl() do { } while (0) 4285# define ftrace_shutdown_sysctl() do { } while (0)
4189 4286
4190static inline int 4287static inline int
4191ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) 4288ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
4192{ 4289{
4193 return 1; 4290 return 1;
4194} 4291}
@@ -4211,7 +4308,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4211 do_for_each_ftrace_op(op, ftrace_control_list) { 4308 do_for_each_ftrace_op(op, ftrace_control_list) {
4212 if (!(op->flags & FTRACE_OPS_FL_STUB) && 4309 if (!(op->flags & FTRACE_OPS_FL_STUB) &&
4213 !ftrace_function_local_disabled(op) && 4310 !ftrace_function_local_disabled(op) &&
4214 ftrace_ops_test(op, ip)) 4311 ftrace_ops_test(op, ip, regs))
4215 op->func(ip, parent_ip, op, regs); 4312 op->func(ip, parent_ip, op, regs);
4216 } while_for_each_ftrace_op(op); 4313 } while_for_each_ftrace_op(op);
4217 trace_recursion_clear(TRACE_CONTROL_BIT); 4314 trace_recursion_clear(TRACE_CONTROL_BIT);
@@ -4244,7 +4341,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4244 */ 4341 */
4245 preempt_disable_notrace(); 4342 preempt_disable_notrace();
4246 do_for_each_ftrace_op(op, ftrace_ops_list) { 4343 do_for_each_ftrace_op(op, ftrace_ops_list) {
4247 if (ftrace_ops_test(op, ip)) 4344 if (ftrace_ops_test(op, ip, regs))
4248 op->func(ip, parent_ip, op, regs); 4345 op->func(ip, parent_ip, op, regs);
4249 } while_for_each_ftrace_op(op); 4346 } while_for_each_ftrace_op(op);
4250 preempt_enable_notrace(); 4347 preempt_enable_notrace();
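
Both call sites above now hand regs through to ftrace_ops_test(), so a per-ops test can look at more than the instruction pointer. Stripped of the kernel's RCU and recursion protection, the dispatch pattern is simply "walk the ops list and call each callback whose own test accepts (ip, regs)"; a compilable toy version, using none of the kernel types:

#include <stdbool.h>
#include <stdio.h>

struct regs;                          /* opaque here, like pt_regs */

struct op {
	bool (*test)(struct op *op, unsigned long ip, struct regs *regs);
	void (*func)(unsigned long ip, unsigned long parent_ip,
		     struct op *op, struct regs *regs);
	struct op *next;
};

/* Same shape as __ftrace_ops_list_func(): only ops that accept (ip, regs) run. */
static void call_ops(struct op *list, unsigned long ip,
		     unsigned long parent_ip, struct regs *regs)
{
	for (struct op *op = list; op; op = op->next)
		if (op->test(op, ip, regs))
			op->func(ip, parent_ip, op, regs);
}

static bool accept_nonzero(struct op *op, unsigned long ip, struct regs *regs)
{
	(void)op; (void)regs;
	return ip != 0;               /* stand-in for a filter-hash lookup */
}

static void print_hit(unsigned long ip, unsigned long parent_ip,
		      struct op *op, struct regs *regs)
{
	(void)op; (void)regs;
	printf("traced %#lx (called from %#lx)\n", ip, parent_ip);
}

int main(void)
{
	struct op one = { accept_nonzero, print_hit, NULL };
	call_ops(&one, 0x1234, 0x5678, NULL);
	return 0;
}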
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e444ff88f0a4..cc2f66f68dc5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -36,11 +36,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
36{ 36{
37 int ret; 37 int ret;
38 38
39 ret = trace_seq_printf(s, "# compressed entry header\n"); 39 ret = trace_seq_puts(s, "# compressed entry header\n");
40 ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); 40 ret = trace_seq_puts(s, "\ttype_len : 5 bits\n");
41 ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); 41 ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n");
42 ret = trace_seq_printf(s, "\tarray : 32 bits\n"); 42 ret = trace_seq_puts(s, "\tarray : 32 bits\n");
43 ret = trace_seq_printf(s, "\n"); 43 ret = trace_seq_putc(s, '\n');
44 ret = trace_seq_printf(s, "\tpadding : type == %d\n", 44 ret = trace_seq_printf(s, "\tpadding : type == %d\n",
45 RINGBUF_TYPE_PADDING); 45 RINGBUF_TYPE_PADDING);
46 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", 46 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
@@ -1066,7 +1066,7 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
1066} 1066}
1067 1067
1068/** 1068/**
1069 * check_pages - integrity check of buffer pages 1069 * rb_check_pages - integrity check of buffer pages
1070 * @cpu_buffer: CPU buffer with pages to test 1070 * @cpu_buffer: CPU buffer with pages to test
1071 * 1071 *
1072 * As a safety measure we check to make sure the data pages have not 1072 * As a safety measure we check to make sure the data pages have not
@@ -1258,7 +1258,7 @@ static int rb_cpu_notify(struct notifier_block *self,
1258#endif 1258#endif
1259 1259
1260/** 1260/**
1261 * ring_buffer_alloc - allocate a new ring_buffer 1261 * __ring_buffer_alloc - allocate a new ring_buffer
1262 * @size: the size in bytes per cpu that is needed. 1262 * @size: the size in bytes per cpu that is needed.
1263 * @flags: attributes to set for the ring buffer. 1263 * @flags: attributes to set for the ring buffer.
1264 * 1264 *
@@ -1607,6 +1607,7 @@ static void update_pages_handler(struct work_struct *work)
1607 * ring_buffer_resize - resize the ring buffer 1607 * ring_buffer_resize - resize the ring buffer
1608 * @buffer: the buffer to resize. 1608 * @buffer: the buffer to resize.
1609 * @size: the new size. 1609 * @size: the new size.
1610 * @cpu_id: the cpu buffer to resize
1610 * 1611 *
1611 * Minimum size is 2 * BUF_PAGE_SIZE. 1612 * Minimum size is 2 * BUF_PAGE_SIZE.
1612 * 1613 *
@@ -3956,11 +3957,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
3956 * expected. 3957 * expected.
3957 * 3958 *
3958 * After a sequence of ring_buffer_read_prepare calls, the user is 3959 * After a sequence of ring_buffer_read_prepare calls, the user is
3959 * expected to make at least one call to ring_buffer_prepare_sync. 3960 * expected to make at least one call to ring_buffer_read_prepare_sync.
3960 * Afterwards, ring_buffer_read_start is invoked to get things going 3961 * Afterwards, ring_buffer_read_start is invoked to get things going
3961 * for real. 3962 * for real.
3962 * 3963 *
3963 * This overall must be paired with ring_buffer_finish. 3964 * This overall must be paired with ring_buffer_read_finish.
3964 */ 3965 */
3965struct ring_buffer_iter * 3966struct ring_buffer_iter *
3966ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) 3967ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
@@ -4009,7 +4010,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
4009 * an intervening ring_buffer_read_prepare_sync must have been 4010 * an intervening ring_buffer_read_prepare_sync must have been
4010 * performed. 4011 * performed.
4011 * 4012 *
4012 * Must be paired with ring_buffer_finish. 4013 * Must be paired with ring_buffer_read_finish.
4013 */ 4014 */
4014void 4015void
4015ring_buffer_read_start(struct ring_buffer_iter *iter) 4016ring_buffer_read_start(struct ring_buffer_iter *iter)
@@ -4031,7 +4032,7 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)
4031EXPORT_SYMBOL_GPL(ring_buffer_read_start); 4032EXPORT_SYMBOL_GPL(ring_buffer_read_start);
4032 4033
4033/** 4034/**
4034 * ring_buffer_finish - finish reading the iterator of the buffer 4035 * ring_buffer_read_finish - finish reading the iterator of the buffer
4035 * @iter: The iterator retrieved by ring_buffer_start 4036 * @iter: The iterator retrieved by ring_buffer_start
4036 * 4037 *
4037 * This re-enables the recording to the buffer, and frees the 4038 * This re-enables the recording to the buffer, and frees the
@@ -4346,6 +4347,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
4346/** 4347/**
4347 * ring_buffer_alloc_read_page - allocate a page to read from buffer 4348 * ring_buffer_alloc_read_page - allocate a page to read from buffer
4348 * @buffer: the buffer to allocate for. 4349 * @buffer: the buffer to allocate for.
4350 * @cpu: the cpu buffer to allocate.
4349 * 4351 *
4350 * This function is used in conjunction with ring_buffer_read_page. 4352 * This function is used in conjunction with ring_buffer_read_page.
4351 * When reading a full page from the ring buffer, these functions 4353 * When reading a full page from the ring buffer, these functions
@@ -4403,7 +4405,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
4403 * to swap with a page in the ring buffer. 4405 * to swap with a page in the ring buffer.
4404 * 4406 *
4405 * for example: 4407 * for example:
4406 * rpage = ring_buffer_alloc_read_page(buffer); 4408 * rpage = ring_buffer_alloc_read_page(buffer, cpu);
4407 * if (!rpage) 4409 * if (!rpage)
4408 * return error; 4410 * return error;
4409 * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); 4411 * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
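
The kernel-doc corrections above are easiest to read as a usage contract: ring_buffer_alloc_read_page()/ring_buffer_free_read_page() come in a pair, as do ring_buffer_read_prepare(), ..._read_prepare_sync(), ..._read_start() and ..._read_finish(). A sketch of a page reader, assuming the 3.11-era prototypes visible in these hunks and skipping most error handling:

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/ring_buffer.h>

/* Copy one page worth of events for @cpu out of @buffer. */
static int drain_one_page(struct ring_buffer *buffer, int cpu)
{
	void *rpage;
	int ret;

	rpage = ring_buffer_alloc_read_page(buffer, cpu);    /* note the @cpu argument */
	if (!rpage)
		return -ENOMEM;

	/* last argument 0: partial pages are fine, see ring_buffer_read_page() */
	ret = ring_buffer_read_page(buffer, &rpage, PAGE_SIZE, cpu, 0);

	ring_buffer_free_read_page(buffer, rpage);           /* always balance the alloc */
	return ret < 0 ? ret : 0;
}

The iterator side follows the same discipline: each chain of ring_buffer_read_prepare() calls needs one ring_buffer_read_prepare_sync(), then ring_buffer_read_start(), and finally one ring_buffer_read_finish() per iterator, which is exactly what the renamed references now say.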
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e71a8be4a6ee..496f94d57698 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -115,6 +115,9 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
115 115
116enum ftrace_dump_mode ftrace_dump_on_oops; 116enum ftrace_dump_mode ftrace_dump_on_oops;
117 117
118/* When set, tracing will stop when a WARN*() is hit */
119int __disable_trace_on_warning;
120
118static int tracing_set_tracer(const char *buf); 121static int tracing_set_tracer(const char *buf);
119 122
120#define MAX_TRACER_SIZE 100 123#define MAX_TRACER_SIZE 100
@@ -149,6 +152,13 @@ static int __init set_ftrace_dump_on_oops(char *str)
149} 152}
150__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 153__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
151 154
155static int __init stop_trace_on_warning(char *str)
156{
157 __disable_trace_on_warning = 1;
158 return 1;
159}
160__setup("traceoff_on_warning=", stop_trace_on_warning);
161
152static int __init boot_alloc_snapshot(char *str) 162static int __init boot_alloc_snapshot(char *str)
153{ 163{
154 allocate_snapshot = true; 164 allocate_snapshot = true;
@@ -170,6 +180,7 @@ static int __init set_trace_boot_options(char *str)
170} 180}
171__setup("trace_options=", set_trace_boot_options); 181__setup("trace_options=", set_trace_boot_options);
172 182
183
173unsigned long long ns2usecs(cycle_t nsec) 184unsigned long long ns2usecs(cycle_t nsec)
174{ 185{
175 nsec += 500; 186 nsec += 500;
@@ -193,6 +204,37 @@ static struct trace_array global_trace;
193 204
194LIST_HEAD(ftrace_trace_arrays); 205LIST_HEAD(ftrace_trace_arrays);
195 206
207int trace_array_get(struct trace_array *this_tr)
208{
209 struct trace_array *tr;
210 int ret = -ENODEV;
211
212 mutex_lock(&trace_types_lock);
213 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
214 if (tr == this_tr) {
215 tr->ref++;
216 ret = 0;
217 break;
218 }
219 }
220 mutex_unlock(&trace_types_lock);
221
222 return ret;
223}
224
225static void __trace_array_put(struct trace_array *this_tr)
226{
227 WARN_ON(!this_tr->ref);
228 this_tr->ref--;
229}
230
231void trace_array_put(struct trace_array *this_tr)
232{
233 mutex_lock(&trace_types_lock);
234 __trace_array_put(this_tr);
235 mutex_unlock(&trace_types_lock);
236}
237
196int filter_current_check_discard(struct ring_buffer *buffer, 238int filter_current_check_discard(struct ring_buffer *buffer,
197 struct ftrace_event_call *call, void *rec, 239 struct ftrace_event_call *call, void *rec,
198 struct ring_buffer_event *event) 240 struct ring_buffer_event *event)
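
trace_array_get() only takes a reference if the array is still on ftrace_trace_arrays, which is what lets instance removal race safely against open(): a caller either finds the array on the list and pins it, or gets -ENODEV. The "validate membership under the lock, then count" pattern, reduced to a standalone pthread model with toy types:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct toy_array { struct toy_array *next; int ref; const char *name; };

static pthread_mutex_t types_lock = PTHREAD_MUTEX_INITIALIZER;
static struct toy_array *array_list;      /* plays the role of ftrace_trace_arrays */

/* Succeeds only while the array is still on the list, like trace_array_get(). */
static int toy_array_get(struct toy_array *this_tr)
{
	int ret = -ENODEV;

	pthread_mutex_lock(&types_lock);
	for (struct toy_array *tr = array_list; tr; tr = tr->next) {
		if (tr == this_tr) {
			tr->ref++;
			ret = 0;
			break;
		}
	}
	pthread_mutex_unlock(&types_lock);
	return ret;
}

static void toy_array_put(struct toy_array *this_tr)
{
	pthread_mutex_lock(&types_lock);
	this_tr->ref--;                       /* caller must already hold a reference */
	pthread_mutex_unlock(&types_lock);
}

int main(void)
{
	struct toy_array global = { .name = "global" };
	struct toy_array gone   = { .name = "removed" };   /* never put on the list */

	array_list = &global;

	printf("get(global)  -> %d (ref=%d)\n", toy_array_get(&global), global.ref);
	printf("get(removed) -> %d (ref=%d)\n", toy_array_get(&gone), gone.ref);
	toy_array_put(&global);
	return 0;
}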
@@ -201,23 +243,43 @@ int filter_current_check_discard(struct ring_buffer *buffer,
201} 243}
202EXPORT_SYMBOL_GPL(filter_current_check_discard); 244EXPORT_SYMBOL_GPL(filter_current_check_discard);
203 245
204cycle_t ftrace_now(int cpu) 246cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
205{ 247{
206 u64 ts; 248 u64 ts;
207 249
208 /* Early boot up does not have a buffer yet */ 250 /* Early boot up does not have a buffer yet */
209 if (!global_trace.trace_buffer.buffer) 251 if (!buf->buffer)
210 return trace_clock_local(); 252 return trace_clock_local();
211 253
212 ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); 254 ts = ring_buffer_time_stamp(buf->buffer, cpu);
213 ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts); 255 ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts);
214 256
215 return ts; 257 return ts;
216} 258}
217 259
260cycle_t ftrace_now(int cpu)
261{
262 return buffer_ftrace_now(&global_trace.trace_buffer, cpu);
263}
264
265/**
266 * tracing_is_enabled - Show if global_trace has been enabled
267 *
268 * Shows if the global trace has been enabled or not. It uses the
269 * mirror flag "buffer_disabled" so that fast paths such as the
270 * irqsoff tracer can check it cheaply. But it may be inaccurate due
271 * to races. If you need to know the accurate state, use
272 * tracing_is_on(), which is a little slower but accurate.
273 */
218int tracing_is_enabled(void) 274int tracing_is_enabled(void)
219{ 275{
220 return tracing_is_on(); 276 /*
277 * For quick access (irqsoff uses this in fast path), just
278 * return the mirror variable of the state of the ring buffer.
279 * It's a little racy, but we don't really care.
280 */
281 smp_rmb();
282 return !global_trace.buffer_disabled;
221} 283}
222 284
223/* 285/*
@@ -240,7 +302,7 @@ static struct tracer *trace_types __read_mostly;
240/* 302/*
241 * trace_types_lock is used to protect the trace_types list. 303 * trace_types_lock is used to protect the trace_types list.
242 */ 304 */
243static DEFINE_MUTEX(trace_types_lock); 305DEFINE_MUTEX(trace_types_lock);
244 306
245/* 307/*
246 * serialize the access of the ring buffer 308 * serialize the access of the ring buffer
@@ -330,6 +392,23 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
330 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | 392 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
331 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; 393 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
332 394
395static void tracer_tracing_on(struct trace_array *tr)
396{
397 if (tr->trace_buffer.buffer)
398 ring_buffer_record_on(tr->trace_buffer.buffer);
399 /*
400 * This flag is looked at when buffers haven't been allocated
401 * yet, or by some tracers (like irqsoff), that just want to
402 * know if the ring buffer has been disabled, but it can handle
403 * races where it gets disabled while a record is still being made.
404 * As the check is in the fast path of the tracers, it is more
405 * important to be fast than accurate.
406 */
407 tr->buffer_disabled = 0;
408 /* Make the flag seen by readers */
409 smp_wmb();
410}
411
333/** 412/**
334 * tracing_on - enable tracing buffers 413 * tracing_on - enable tracing buffers
335 * 414 *
@@ -338,15 +417,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
338 */ 417 */
339void tracing_on(void) 418void tracing_on(void)
340{ 419{
341 if (global_trace.trace_buffer.buffer) 420 tracer_tracing_on(&global_trace);
342 ring_buffer_record_on(global_trace.trace_buffer.buffer);
343 /*
344 * This flag is only looked at when buffers haven't been
345 * allocated yet. We don't really care about the race
346 * between setting this flag and actually turning
347 * on the buffer.
348 */
349 global_trace.buffer_disabled = 0;
350} 421}
351EXPORT_SYMBOL_GPL(tracing_on); 422EXPORT_SYMBOL_GPL(tracing_on);
352 423
@@ -540,6 +611,23 @@ void tracing_snapshot_alloc(void)
540EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); 611EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
541#endif /* CONFIG_TRACER_SNAPSHOT */ 612#endif /* CONFIG_TRACER_SNAPSHOT */
542 613
614static void tracer_tracing_off(struct trace_array *tr)
615{
616 if (tr->trace_buffer.buffer)
617 ring_buffer_record_off(tr->trace_buffer.buffer);
618 /*
619 * This flag is looked at when buffers haven't been allocated
620 * yet, or by some tracers (like irqsoff), that just want to
621 * know if the ring buffer has been disabled, but it can handle
622 * races where it gets disabled while a record is still being made.
623 * As the check is in the fast path of the tracers, it is more
624 * important to be fast than accurate.
625 */
626 tr->buffer_disabled = 1;
627 /* Make the flag seen by readers */
628 smp_wmb();
629}
630
543/** 631/**
544 * tracing_off - turn off tracing buffers 632 * tracing_off - turn off tracing buffers
545 * 633 *
@@ -550,26 +638,35 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
550 */ 638 */
551void tracing_off(void) 639void tracing_off(void)
552{ 640{
553 if (global_trace.trace_buffer.buffer) 641 tracer_tracing_off(&global_trace);
554 ring_buffer_record_off(global_trace.trace_buffer.buffer);
555 /*
556 * This flag is only looked at when buffers haven't been
557 * allocated yet. We don't really care about the race
558 * between setting this flag and actually turning
559 * on the buffer.
560 */
561 global_trace.buffer_disabled = 1;
562} 642}
563EXPORT_SYMBOL_GPL(tracing_off); 643EXPORT_SYMBOL_GPL(tracing_off);
564 644
645void disable_trace_on_warning(void)
646{
647 if (__disable_trace_on_warning)
648 tracing_off();
649}
650
651/**
652 * tracer_tracing_is_on - show real state of ring buffer enabled
653 * @tr : the trace array to know if ring buffer is enabled
654 *
655 * Shows real state of the ring buffer if it is enabled or not.
656 */
657static int tracer_tracing_is_on(struct trace_array *tr)
658{
659 if (tr->trace_buffer.buffer)
660 return ring_buffer_record_is_on(tr->trace_buffer.buffer);
661 return !tr->buffer_disabled;
662}
663
565/** 664/**
566 * tracing_is_on - show state of ring buffers enabled 665 * tracing_is_on - show state of ring buffers enabled
567 */ 666 */
568int tracing_is_on(void) 667int tracing_is_on(void)
569{ 668{
570 if (global_trace.trace_buffer.buffer) 669 return tracer_tracing_is_on(&global_trace);
571 return ring_buffer_record_is_on(global_trace.trace_buffer.buffer);
572 return !global_trace.buffer_disabled;
573} 670}
574EXPORT_SYMBOL_GPL(tracing_is_on); 671EXPORT_SYMBOL_GPL(tracing_is_on);
575 672
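
tracer_tracing_on()/tracer_tracing_off() keep tr->buffer_disabled as a cheap mirror of the ring buffer state, and tracing_is_enabled() now reads only that mirror behind an smp_rmb(), so fast paths like the irqsoff tracer never have to touch the buffer. The write barrier in the updaters pairs with the read barrier in the reader; the skeleton of that idiom, with a hypothetical flag name:

#include <asm/barrier.h>

static int fast_path_disabled;        /* mirror of a state that is slow to query */

static void mark_disabled(void)
{
	/* ... the real, expensive state change happens here ... */
	fast_path_disabled = 1;
	smp_wmb();                    /* pairs with smp_rmb() in fast_path_is_enabled() */
}

static void mark_enabled(void)
{
	/* ... the real, expensive state change happens here ... */
	fast_path_disabled = 0;
	smp_wmb();
}

static int fast_path_is_enabled(void)
{
	smp_rmb();                    /* pairs with the smp_wmb() in the updaters */
	return !fast_path_disabled;   /* may be momentarily stale; callers tolerate that */
}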
@@ -1119,7 +1216,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf)
1119 /* Make sure all commits have finished */ 1216 /* Make sure all commits have finished */
1120 synchronize_sched(); 1217 synchronize_sched();
1121 1218
1122 buf->time_start = ftrace_now(buf->cpu); 1219 buf->time_start = buffer_ftrace_now(buf, buf->cpu);
1123 1220
1124 for_each_online_cpu(cpu) 1221 for_each_online_cpu(cpu)
1125 ring_buffer_reset_cpu(buffer, cpu); 1222 ring_buffer_reset_cpu(buffer, cpu);
@@ -1127,23 +1224,17 @@ void tracing_reset_online_cpus(struct trace_buffer *buf)
1127 ring_buffer_record_enable(buffer); 1224 ring_buffer_record_enable(buffer);
1128} 1225}
1129 1226
1130void tracing_reset_current(int cpu) 1227/* Must have trace_types_lock held */
1131{
1132 tracing_reset(&global_trace.trace_buffer, cpu);
1133}
1134
1135void tracing_reset_all_online_cpus(void) 1228void tracing_reset_all_online_cpus(void)
1136{ 1229{
1137 struct trace_array *tr; 1230 struct trace_array *tr;
1138 1231
1139 mutex_lock(&trace_types_lock);
1140 list_for_each_entry(tr, &ftrace_trace_arrays, list) { 1232 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
1141 tracing_reset_online_cpus(&tr->trace_buffer); 1233 tracing_reset_online_cpus(&tr->trace_buffer);
1142#ifdef CONFIG_TRACER_MAX_TRACE 1234#ifdef CONFIG_TRACER_MAX_TRACE
1143 tracing_reset_online_cpus(&tr->max_buffer); 1235 tracing_reset_online_cpus(&tr->max_buffer);
1144#endif 1236#endif
1145 } 1237 }
1146 mutex_unlock(&trace_types_lock);
1147} 1238}
1148 1239
1149#define SAVED_CMDLINES 128 1240#define SAVED_CMDLINES 128
@@ -1543,15 +1634,6 @@ trace_function(struct trace_array *tr,
1543 __buffer_unlock_commit(buffer, event); 1634 __buffer_unlock_commit(buffer, event);
1544} 1635}
1545 1636
1546void
1547ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1548 unsigned long ip, unsigned long parent_ip, unsigned long flags,
1549 int pc)
1550{
1551 if (likely(!atomic_read(&data->disabled)))
1552 trace_function(tr, ip, parent_ip, flags, pc);
1553}
1554
1555#ifdef CONFIG_STACKTRACE 1637#ifdef CONFIG_STACKTRACE
1556 1638
1557#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) 1639#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
@@ -2760,6 +2842,17 @@ static int s_show(struct seq_file *m, void *v)
2760 return 0; 2842 return 0;
2761} 2843}
2762 2844
2845/*
2846 * Should be used after trace_array_get(), trace_types_lock
2847 * ensures that i_cdev was already initialized.
2848 */
2849static inline int tracing_get_cpu(struct inode *inode)
2850{
2851 if (inode->i_cdev) /* See trace_create_cpu_file() */
2852 return (long)inode->i_cdev - 1;
2853 return RING_BUFFER_ALL_CPUS;
2854}
2855
2763static const struct seq_operations tracer_seq_ops = { 2856static const struct seq_operations tracer_seq_ops = {
2764 .start = s_start, 2857 .start = s_start,
2765 .next = s_next, 2858 .next = s_next,
@@ -2770,8 +2863,7 @@ static const struct seq_operations tracer_seq_ops = {
2770static struct trace_iterator * 2863static struct trace_iterator *
2771__tracing_open(struct inode *inode, struct file *file, bool snapshot) 2864__tracing_open(struct inode *inode, struct file *file, bool snapshot)
2772{ 2865{
2773 struct trace_cpu *tc = inode->i_private; 2866 struct trace_array *tr = inode->i_private;
2774 struct trace_array *tr = tc->tr;
2775 struct trace_iterator *iter; 2867 struct trace_iterator *iter;
2776 int cpu; 2868 int cpu;
2777 2869
@@ -2812,8 +2904,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2812 iter->trace_buffer = &tr->trace_buffer; 2904 iter->trace_buffer = &tr->trace_buffer;
2813 iter->snapshot = snapshot; 2905 iter->snapshot = snapshot;
2814 iter->pos = -1; 2906 iter->pos = -1;
2907 iter->cpu_file = tracing_get_cpu(inode);
2815 mutex_init(&iter->mutex); 2908 mutex_init(&iter->mutex);
2816 iter->cpu_file = tc->cpu;
2817 2909
2818 /* Notify the tracer early; before we stop tracing. */ 2910 /* Notify the tracer early; before we stop tracing. */
2819 if (iter->trace && iter->trace->open) 2911 if (iter->trace && iter->trace->open)
@@ -2850,8 +2942,6 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2850 tracing_iter_reset(iter, cpu); 2942 tracing_iter_reset(iter, cpu);
2851 } 2943 }
2852 2944
2853 tr->ref++;
2854
2855 mutex_unlock(&trace_types_lock); 2945 mutex_unlock(&trace_types_lock);
2856 2946
2857 return iter; 2947 return iter;
@@ -2874,24 +2964,41 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2874 return 0; 2964 return 0;
2875} 2965}
2876 2966
2967/*
2968 * Open and update trace_array ref count.
2969 * Must have the current trace_array passed to it.
2970 */
2971static int tracing_open_generic_tr(struct inode *inode, struct file *filp)
2972{
2973 struct trace_array *tr = inode->i_private;
2974
2975 if (tracing_disabled)
2976 return -ENODEV;
2977
2978 if (trace_array_get(tr) < 0)
2979 return -ENODEV;
2980
2981 filp->private_data = inode->i_private;
2982
2983 return 0;
2984}
2985
2877static int tracing_release(struct inode *inode, struct file *file) 2986static int tracing_release(struct inode *inode, struct file *file)
2878{ 2987{
2988 struct trace_array *tr = inode->i_private;
2879 struct seq_file *m = file->private_data; 2989 struct seq_file *m = file->private_data;
2880 struct trace_iterator *iter; 2990 struct trace_iterator *iter;
2881 struct trace_array *tr;
2882 int cpu; 2991 int cpu;
2883 2992
2884 if (!(file->f_mode & FMODE_READ)) 2993 if (!(file->f_mode & FMODE_READ)) {
2994 trace_array_put(tr);
2885 return 0; 2995 return 0;
2996 }
2886 2997
2998 /* Writes do not use seq_file */
2887 iter = m->private; 2999 iter = m->private;
2888 tr = iter->tr;
2889
2890 mutex_lock(&trace_types_lock); 3000 mutex_lock(&trace_types_lock);
2891 3001
2892 WARN_ON(!tr->ref);
2893 tr->ref--;
2894
2895 for_each_tracing_cpu(cpu) { 3002 for_each_tracing_cpu(cpu) {
2896 if (iter->buffer_iter[cpu]) 3003 if (iter->buffer_iter[cpu])
2897 ring_buffer_read_finish(iter->buffer_iter[cpu]); 3004 ring_buffer_read_finish(iter->buffer_iter[cpu]);
@@ -2903,6 +3010,9 @@ static int tracing_release(struct inode *inode, struct file *file)
2903 if (!iter->snapshot) 3010 if (!iter->snapshot)
2904 /* reenable tracing if it was previously enabled */ 3011 /* reenable tracing if it was previously enabled */
2905 tracing_start_tr(tr); 3012 tracing_start_tr(tr);
3013
3014 __trace_array_put(tr);
3015
2906 mutex_unlock(&trace_types_lock); 3016 mutex_unlock(&trace_types_lock);
2907 3017
2908 mutex_destroy(&iter->mutex); 3018 mutex_destroy(&iter->mutex);
@@ -2910,24 +3020,44 @@ static int tracing_release(struct inode *inode, struct file *file)
2910 kfree(iter->trace); 3020 kfree(iter->trace);
2911 kfree(iter->buffer_iter); 3021 kfree(iter->buffer_iter);
2912 seq_release_private(inode, file); 3022 seq_release_private(inode, file);
3023
3024 return 0;
3025}
3026
3027static int tracing_release_generic_tr(struct inode *inode, struct file *file)
3028{
3029 struct trace_array *tr = inode->i_private;
3030
3031 trace_array_put(tr);
2913 return 0; 3032 return 0;
2914} 3033}
2915 3034
3035static int tracing_single_release_tr(struct inode *inode, struct file *file)
3036{
3037 struct trace_array *tr = inode->i_private;
3038
3039 trace_array_put(tr);
3040
3041 return single_release(inode, file);
3042}
3043
2916static int tracing_open(struct inode *inode, struct file *file) 3044static int tracing_open(struct inode *inode, struct file *file)
2917{ 3045{
3046 struct trace_array *tr = inode->i_private;
2918 struct trace_iterator *iter; 3047 struct trace_iterator *iter;
2919 int ret = 0; 3048 int ret = 0;
2920 3049
3050 if (trace_array_get(tr) < 0)
3051 return -ENODEV;
3052
2921 /* If this file was open for write, then erase contents */ 3053 /* If this file was open for write, then erase contents */
2922 if ((file->f_mode & FMODE_WRITE) && 3054 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
2923 (file->f_flags & O_TRUNC)) { 3055 int cpu = tracing_get_cpu(inode);
2924 struct trace_cpu *tc = inode->i_private;
2925 struct trace_array *tr = tc->tr;
2926 3056
2927 if (tc->cpu == RING_BUFFER_ALL_CPUS) 3057 if (cpu == RING_BUFFER_ALL_CPUS)
2928 tracing_reset_online_cpus(&tr->trace_buffer); 3058 tracing_reset_online_cpus(&tr->trace_buffer);
2929 else 3059 else
2930 tracing_reset(&tr->trace_buffer, tc->cpu); 3060 tracing_reset(&tr->trace_buffer, cpu);
2931 } 3061 }
2932 3062
2933 if (file->f_mode & FMODE_READ) { 3063 if (file->f_mode & FMODE_READ) {
@@ -2937,6 +3067,10 @@ static int tracing_open(struct inode *inode, struct file *file)
2937 else if (trace_flags & TRACE_ITER_LATENCY_FMT) 3067 else if (trace_flags & TRACE_ITER_LATENCY_FMT)
2938 iter->iter_flags |= TRACE_FILE_LAT_FMT; 3068 iter->iter_flags |= TRACE_FILE_LAT_FMT;
2939 } 3069 }
3070
3071 if (ret < 0)
3072 trace_array_put(tr);
3073
2940 return ret; 3074 return ret;
2941} 3075}
2942 3076
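
A recurring rule in these hunks: every ->open that succeeds after trace_array_get() must be balanced by a trace_array_put() on every later path, including the error branch just added to tracing_open() and the new tracing_release_generic_tr()/tracing_single_release_tr() helpers. The pattern in isolation, with hypothetical my_* names and a made-up per-file setup step:

#include <linux/errno.h>
#include <linux/fs.h>

struct trace_array;

/* Provided by trace.c (see the trace_array_get()/put() hunks above). */
int trace_array_get(struct trace_array *tr);
void trace_array_put(struct trace_array *tr);

static int my_setup(struct file *filp)
{
	filp->private_data = filp->f_inode->i_private;   /* made-up setup step */
	return 0;
}

static int my_open(struct inode *inode, struct file *filp)
{
	struct trace_array *tr = inode->i_private;
	int ret;

	if (trace_array_get(tr) < 0)           /* pin the instance, or fail */
		return -ENODEV;

	ret = my_setup(filp);
	if (ret < 0)
		trace_array_put(tr);           /* drop the pin on every error path */
	return ret;
}

static int my_release(struct inode *inode, struct file *filp)
{
	trace_array_put(inode->i_private);     /* balances the successful open */
	return 0;
}

static const struct file_operations my_fops = {
	.open    = my_open,
	.release = my_release,
};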
@@ -3293,17 +3427,27 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
3293 3427
3294static int tracing_trace_options_open(struct inode *inode, struct file *file) 3428static int tracing_trace_options_open(struct inode *inode, struct file *file)
3295{ 3429{
3430 struct trace_array *tr = inode->i_private;
3431 int ret;
3432
3296 if (tracing_disabled) 3433 if (tracing_disabled)
3297 return -ENODEV; 3434 return -ENODEV;
3298 3435
3299 return single_open(file, tracing_trace_options_show, inode->i_private); 3436 if (trace_array_get(tr) < 0)
3437 return -ENODEV;
3438
3439 ret = single_open(file, tracing_trace_options_show, inode->i_private);
3440 if (ret < 0)
3441 trace_array_put(tr);
3442
3443 return ret;
3300} 3444}
3301 3445
3302static const struct file_operations tracing_iter_fops = { 3446static const struct file_operations tracing_iter_fops = {
3303 .open = tracing_trace_options_open, 3447 .open = tracing_trace_options_open,
3304 .read = seq_read, 3448 .read = seq_read,
3305 .llseek = seq_lseek, 3449 .llseek = seq_lseek,
3306 .release = single_release, 3450 .release = tracing_single_release_tr,
3307 .write = tracing_trace_options_write, 3451 .write = tracing_trace_options_write,
3308}; 3452};
3309 3453
@@ -3379,14 +3523,14 @@ static const char readme_msg[] =
3379 "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" 3523 "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n"
3380 "\t\t\t Read the contents for more information\n" 3524 "\t\t\t Read the contents for more information\n"
3381#endif 3525#endif
3382#ifdef CONFIG_STACKTRACE 3526#ifdef CONFIG_STACK_TRACER
3383 " stack_trace\t\t- Shows the max stack trace when active\n" 3527 " stack_trace\t\t- Shows the max stack trace when active\n"
3384 " stack_max_size\t- Shows current max stack size that was traced\n" 3528 " stack_max_size\t- Shows current max stack size that was traced\n"
3385 "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" 3529 "\t\t\t Write into this file to reset the max size (trigger a new trace)\n"
3386#ifdef CONFIG_DYNAMIC_FTRACE 3530#ifdef CONFIG_DYNAMIC_FTRACE
3387 " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" 3531 " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n"
3388#endif 3532#endif
3389#endif /* CONFIG_STACKTRACE */ 3533#endif /* CONFIG_STACK_TRACER */
3390; 3534;
3391 3535
3392static ssize_t 3536static ssize_t
@@ -3783,20 +3927,23 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
3783 3927
3784static int tracing_open_pipe(struct inode *inode, struct file *filp) 3928static int tracing_open_pipe(struct inode *inode, struct file *filp)
3785{ 3929{
3786 struct trace_cpu *tc = inode->i_private; 3930 struct trace_array *tr = inode->i_private;
3787 struct trace_array *tr = tc->tr;
3788 struct trace_iterator *iter; 3931 struct trace_iterator *iter;
3789 int ret = 0; 3932 int ret = 0;
3790 3933
3791 if (tracing_disabled) 3934 if (tracing_disabled)
3792 return -ENODEV; 3935 return -ENODEV;
3793 3936
3937 if (trace_array_get(tr) < 0)
3938 return -ENODEV;
3939
3794 mutex_lock(&trace_types_lock); 3940 mutex_lock(&trace_types_lock);
3795 3941
3796 /* create a buffer to store the information to pass to userspace */ 3942 /* create a buffer to store the information to pass to userspace */
3797 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 3943 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
3798 if (!iter) { 3944 if (!iter) {
3799 ret = -ENOMEM; 3945 ret = -ENOMEM;
3946 __trace_array_put(tr);
3800 goto out; 3947 goto out;
3801 } 3948 }
3802 3949
@@ -3826,9 +3973,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3826 if (trace_clocks[tr->clock_id].in_ns) 3973 if (trace_clocks[tr->clock_id].in_ns)
3827 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 3974 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3828 3975
3829 iter->cpu_file = tc->cpu; 3976 iter->tr = tr;
3830 iter->tr = tc->tr; 3977 iter->trace_buffer = &tr->trace_buffer;
3831 iter->trace_buffer = &tc->tr->trace_buffer; 3978 iter->cpu_file = tracing_get_cpu(inode);
3832 mutex_init(&iter->mutex); 3979 mutex_init(&iter->mutex);
3833 filp->private_data = iter; 3980 filp->private_data = iter;
3834 3981
@@ -3843,6 +3990,7 @@ out:
3843fail: 3990fail:
3844 kfree(iter->trace); 3991 kfree(iter->trace);
3845 kfree(iter); 3992 kfree(iter);
3993 __trace_array_put(tr);
3846 mutex_unlock(&trace_types_lock); 3994 mutex_unlock(&trace_types_lock);
3847 return ret; 3995 return ret;
3848} 3996}
@@ -3850,6 +3998,7 @@ fail:
3850static int tracing_release_pipe(struct inode *inode, struct file *file) 3998static int tracing_release_pipe(struct inode *inode, struct file *file)
3851{ 3999{
3852 struct trace_iterator *iter = file->private_data; 4000 struct trace_iterator *iter = file->private_data;
4001 struct trace_array *tr = inode->i_private;
3853 4002
3854 mutex_lock(&trace_types_lock); 4003 mutex_lock(&trace_types_lock);
3855 4004
@@ -3863,6 +4012,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
3863 kfree(iter->trace); 4012 kfree(iter->trace);
3864 kfree(iter); 4013 kfree(iter);
3865 4014
4015 trace_array_put(tr);
4016
3866 return 0; 4017 return 0;
3867} 4018}
3868 4019
@@ -3939,7 +4090,7 @@ static int tracing_wait_pipe(struct file *filp)
3939 * 4090 *
3940 * iter->pos will be 0 if we haven't read anything. 4091 * iter->pos will be 0 if we haven't read anything.
3941 */ 4092 */
3942 if (!tracing_is_enabled() && iter->pos) 4093 if (!tracing_is_on() && iter->pos)
3943 break; 4094 break;
3944 } 4095 }
3945 4096
@@ -4000,6 +4151,7 @@ waitagain:
4000 memset(&iter->seq, 0, 4151 memset(&iter->seq, 0,
4001 sizeof(struct trace_iterator) - 4152 sizeof(struct trace_iterator) -
4002 offsetof(struct trace_iterator, seq)); 4153 offsetof(struct trace_iterator, seq));
4154 cpumask_clear(iter->started);
4003 iter->pos = -1; 4155 iter->pos = -1;
4004 4156
4005 trace_event_read_lock(); 4157 trace_event_read_lock();
@@ -4200,15 +4352,16 @@ static ssize_t
4200tracing_entries_read(struct file *filp, char __user *ubuf, 4352tracing_entries_read(struct file *filp, char __user *ubuf,
4201 size_t cnt, loff_t *ppos) 4353 size_t cnt, loff_t *ppos)
4202{ 4354{
4203 struct trace_cpu *tc = filp->private_data; 4355 struct inode *inode = file_inode(filp);
4204 struct trace_array *tr = tc->tr; 4356 struct trace_array *tr = inode->i_private;
4357 int cpu = tracing_get_cpu(inode);
4205 char buf[64]; 4358 char buf[64];
4206 int r = 0; 4359 int r = 0;
4207 ssize_t ret; 4360 ssize_t ret;
4208 4361
4209 mutex_lock(&trace_types_lock); 4362 mutex_lock(&trace_types_lock);
4210 4363
4211 if (tc->cpu == RING_BUFFER_ALL_CPUS) { 4364 if (cpu == RING_BUFFER_ALL_CPUS) {
4212 int cpu, buf_size_same; 4365 int cpu, buf_size_same;
4213 unsigned long size; 4366 unsigned long size;
4214 4367
@@ -4235,7 +4388,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
4235 } else 4388 } else
4236 r = sprintf(buf, "X\n"); 4389 r = sprintf(buf, "X\n");
4237 } else 4390 } else
4238 r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10); 4391 r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10);
4239 4392
4240 mutex_unlock(&trace_types_lock); 4393 mutex_unlock(&trace_types_lock);
4241 4394
@@ -4247,7 +4400,8 @@ static ssize_t
4247tracing_entries_write(struct file *filp, const char __user *ubuf, 4400tracing_entries_write(struct file *filp, const char __user *ubuf,
4248 size_t cnt, loff_t *ppos) 4401 size_t cnt, loff_t *ppos)
4249{ 4402{
4250 struct trace_cpu *tc = filp->private_data; 4403 struct inode *inode = file_inode(filp);
4404 struct trace_array *tr = inode->i_private;
4251 unsigned long val; 4405 unsigned long val;
4252 int ret; 4406 int ret;
4253 4407
@@ -4261,8 +4415,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
4261 4415
4262 /* value is in KB */ 4416 /* value is in KB */
4263 val <<= 10; 4417 val <<= 10;
4264 4418 ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode));
4265 ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu);
4266 if (ret < 0) 4419 if (ret < 0)
4267 return ret; 4420 return ret;
4268 4421
@@ -4316,10 +4469,12 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
4316 4469
4317 /* disable tracing ? */ 4470 /* disable tracing ? */
4318 if (trace_flags & TRACE_ITER_STOP_ON_FREE) 4471 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
4319 tracing_off(); 4472 tracer_tracing_off(tr);
4320 /* resize the ring buffer to 0 */ 4473 /* resize the ring buffer to 0 */
4321 tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); 4474 tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
4322 4475
4476 trace_array_put(tr);
4477
4323 return 0; 4478 return 0;
4324} 4479}
4325 4480
@@ -4328,6 +4483,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4328 size_t cnt, loff_t *fpos) 4483 size_t cnt, loff_t *fpos)
4329{ 4484{
4330 unsigned long addr = (unsigned long)ubuf; 4485 unsigned long addr = (unsigned long)ubuf;
4486 struct trace_array *tr = filp->private_data;
4331 struct ring_buffer_event *event; 4487 struct ring_buffer_event *event;
4332 struct ring_buffer *buffer; 4488 struct ring_buffer *buffer;
4333 struct print_entry *entry; 4489 struct print_entry *entry;
@@ -4387,7 +4543,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4387 4543
4388 local_save_flags(irq_flags); 4544 local_save_flags(irq_flags);
4389 size = sizeof(*entry) + cnt + 2; /* possible \n added */ 4545 size = sizeof(*entry) + cnt + 2; /* possible \n added */
4390 buffer = global_trace.trace_buffer.buffer; 4546 buffer = tr->trace_buffer.buffer;
4391 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 4547 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
4392 irq_flags, preempt_count()); 4548 irq_flags, preempt_count());
4393 if (!event) { 4549 if (!event) {
@@ -4478,12 +4634,12 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4478 * New clock may not be consistent with the previous clock. 4634 * New clock may not be consistent with the previous clock.
4479 * Reset the buffer so that it doesn't have incomparable timestamps. 4635 * Reset the buffer so that it doesn't have incomparable timestamps.
4480 */ 4636 */
4481 tracing_reset_online_cpus(&global_trace.trace_buffer); 4637 tracing_reset_online_cpus(&tr->trace_buffer);
4482 4638
4483#ifdef CONFIG_TRACER_MAX_TRACE 4639#ifdef CONFIG_TRACER_MAX_TRACE
4484 if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) 4640 if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
4485 ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); 4641 ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
4486 tracing_reset_online_cpus(&global_trace.max_buffer); 4642 tracing_reset_online_cpus(&tr->max_buffer);
4487#endif 4643#endif
4488 4644
4489 mutex_unlock(&trace_types_lock); 4645 mutex_unlock(&trace_types_lock);
@@ -4495,10 +4651,20 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4495 4651
4496static int tracing_clock_open(struct inode *inode, struct file *file) 4652static int tracing_clock_open(struct inode *inode, struct file *file)
4497{ 4653{
4654 struct trace_array *tr = inode->i_private;
4655 int ret;
4656
4498 if (tracing_disabled) 4657 if (tracing_disabled)
4499 return -ENODEV; 4658 return -ENODEV;
4500 4659
4501 return single_open(file, tracing_clock_show, inode->i_private); 4660 if (trace_array_get(tr))
4661 return -ENODEV;
4662
4663 ret = single_open(file, tracing_clock_show, inode->i_private);
4664 if (ret < 0)
4665 trace_array_put(tr);
4666
4667 return ret;
4502} 4668}
4503 4669
4504struct ftrace_buffer_info { 4670struct ftrace_buffer_info {
@@ -4510,31 +4676,40 @@ struct ftrace_buffer_info {
4510#ifdef CONFIG_TRACER_SNAPSHOT 4676#ifdef CONFIG_TRACER_SNAPSHOT
4511static int tracing_snapshot_open(struct inode *inode, struct file *file) 4677static int tracing_snapshot_open(struct inode *inode, struct file *file)
4512{ 4678{
4513 struct trace_cpu *tc = inode->i_private; 4679 struct trace_array *tr = inode->i_private;
4514 struct trace_iterator *iter; 4680 struct trace_iterator *iter;
4515 struct seq_file *m; 4681 struct seq_file *m;
4516 int ret = 0; 4682 int ret = 0;
4517 4683
4684 if (trace_array_get(tr) < 0)
4685 return -ENODEV;
4686
4518 if (file->f_mode & FMODE_READ) { 4687 if (file->f_mode & FMODE_READ) {
4519 iter = __tracing_open(inode, file, true); 4688 iter = __tracing_open(inode, file, true);
4520 if (IS_ERR(iter)) 4689 if (IS_ERR(iter))
4521 ret = PTR_ERR(iter); 4690 ret = PTR_ERR(iter);
4522 } else { 4691 } else {
4523 /* Writes still need the seq_file to hold the private data */ 4692 /* Writes still need the seq_file to hold the private data */
4693 ret = -ENOMEM;
4524 m = kzalloc(sizeof(*m), GFP_KERNEL); 4694 m = kzalloc(sizeof(*m), GFP_KERNEL);
4525 if (!m) 4695 if (!m)
4526 return -ENOMEM; 4696 goto out;
4527 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 4697 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
4528 if (!iter) { 4698 if (!iter) {
4529 kfree(m); 4699 kfree(m);
4530 return -ENOMEM; 4700 goto out;
4531 } 4701 }
4532 iter->tr = tc->tr; 4702 ret = 0;
4533 iter->trace_buffer = &tc->tr->max_buffer; 4703
4534 iter->cpu_file = tc->cpu; 4704 iter->tr = tr;
4705 iter->trace_buffer = &tr->max_buffer;
4706 iter->cpu_file = tracing_get_cpu(inode);
4535 m->private = iter; 4707 m->private = iter;
4536 file->private_data = m; 4708 file->private_data = m;
4537 } 4709 }
4710out:
4711 if (ret < 0)
4712 trace_array_put(tr);
4538 4713
4539 return ret; 4714 return ret;
4540} 4715}
@@ -4616,9 +4791,12 @@ out:
4616static int tracing_snapshot_release(struct inode *inode, struct file *file) 4791static int tracing_snapshot_release(struct inode *inode, struct file *file)
4617{ 4792{
4618 struct seq_file *m = file->private_data; 4793 struct seq_file *m = file->private_data;
4794 int ret;
4795
4796 ret = tracing_release(inode, file);
4619 4797
4620 if (file->f_mode & FMODE_READ) 4798 if (file->f_mode & FMODE_READ)
4621 return tracing_release(inode, file); 4799 return ret;
4622 4800
4623 /* If write only, the seq_file is just a stub */ 4801 /* If write only, the seq_file is just a stub */
4624 if (m) 4802 if (m)
@@ -4684,34 +4862,38 @@ static const struct file_operations tracing_pipe_fops = {
4684}; 4862};
4685 4863
4686static const struct file_operations tracing_entries_fops = { 4864static const struct file_operations tracing_entries_fops = {
4687 .open = tracing_open_generic, 4865 .open = tracing_open_generic_tr,
4688 .read = tracing_entries_read, 4866 .read = tracing_entries_read,
4689 .write = tracing_entries_write, 4867 .write = tracing_entries_write,
4690 .llseek = generic_file_llseek, 4868 .llseek = generic_file_llseek,
4869 .release = tracing_release_generic_tr,
4691}; 4870};
4692 4871
4693static const struct file_operations tracing_total_entries_fops = { 4872static const struct file_operations tracing_total_entries_fops = {
4694 .open = tracing_open_generic, 4873 .open = tracing_open_generic_tr,
4695 .read = tracing_total_entries_read, 4874 .read = tracing_total_entries_read,
4696 .llseek = generic_file_llseek, 4875 .llseek = generic_file_llseek,
4876 .release = tracing_release_generic_tr,
4697}; 4877};
4698 4878
4699static const struct file_operations tracing_free_buffer_fops = { 4879static const struct file_operations tracing_free_buffer_fops = {
4880 .open = tracing_open_generic_tr,
4700 .write = tracing_free_buffer_write, 4881 .write = tracing_free_buffer_write,
4701 .release = tracing_free_buffer_release, 4882 .release = tracing_free_buffer_release,
4702}; 4883};
4703 4884
4704static const struct file_operations tracing_mark_fops = { 4885static const struct file_operations tracing_mark_fops = {
4705 .open = tracing_open_generic, 4886 .open = tracing_open_generic_tr,
4706 .write = tracing_mark_write, 4887 .write = tracing_mark_write,
4707 .llseek = generic_file_llseek, 4888 .llseek = generic_file_llseek,
4889 .release = tracing_release_generic_tr,
4708}; 4890};
4709 4891
4710static const struct file_operations trace_clock_fops = { 4892static const struct file_operations trace_clock_fops = {
4711 .open = tracing_clock_open, 4893 .open = tracing_clock_open,
4712 .read = seq_read, 4894 .read = seq_read,
4713 .llseek = seq_lseek, 4895 .llseek = seq_lseek,
4714 .release = single_release, 4896 .release = tracing_single_release_tr,
4715 .write = tracing_clock_write, 4897 .write = tracing_clock_write,
4716}; 4898};
4717 4899
@@ -4736,23 +4918,26 @@ static const struct file_operations snapshot_raw_fops = {
4736 4918
4737static int tracing_buffers_open(struct inode *inode, struct file *filp) 4919static int tracing_buffers_open(struct inode *inode, struct file *filp)
4738{ 4920{
4739 struct trace_cpu *tc = inode->i_private; 4921 struct trace_array *tr = inode->i_private;
4740 struct trace_array *tr = tc->tr;
4741 struct ftrace_buffer_info *info; 4922 struct ftrace_buffer_info *info;
4923 int ret;
4742 4924
4743 if (tracing_disabled) 4925 if (tracing_disabled)
4744 return -ENODEV; 4926 return -ENODEV;
4745 4927
4928 if (trace_array_get(tr) < 0)
4929 return -ENODEV;
4930
4746 info = kzalloc(sizeof(*info), GFP_KERNEL); 4931 info = kzalloc(sizeof(*info), GFP_KERNEL);
4747 if (!info) 4932 if (!info) {
4933 trace_array_put(tr);
4748 return -ENOMEM; 4934 return -ENOMEM;
4935 }
4749 4936
4750 mutex_lock(&trace_types_lock); 4937 mutex_lock(&trace_types_lock);
4751 4938
4752 tr->ref++;
4753
4754 info->iter.tr = tr; 4939 info->iter.tr = tr;
4755 info->iter.cpu_file = tc->cpu; 4940 info->iter.cpu_file = tracing_get_cpu(inode);
4756 info->iter.trace = tr->current_trace; 4941 info->iter.trace = tr->current_trace;
4757 info->iter.trace_buffer = &tr->trace_buffer; 4942 info->iter.trace_buffer = &tr->trace_buffer;
4758 info->spare = NULL; 4943 info->spare = NULL;
@@ -4763,7 +4948,11 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
4763 4948
4764 mutex_unlock(&trace_types_lock); 4949 mutex_unlock(&trace_types_lock);
4765 4950
4766 return nonseekable_open(inode, filp); 4951 ret = nonseekable_open(inode, filp);
4952 if (ret < 0)
4953 trace_array_put(tr);
4954
4955 return ret;
4767} 4956}
4768 4957
4769static unsigned int 4958static unsigned int
@@ -4863,8 +5052,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
4863 5052
4864 mutex_lock(&trace_types_lock); 5053 mutex_lock(&trace_types_lock);
4865 5054
4866 WARN_ON(!iter->tr->ref); 5055 __trace_array_put(iter->tr);
4867 iter->tr->ref--;
4868 5056
4869 if (info->spare) 5057 if (info->spare)
4870 ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); 5058 ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
@@ -5066,14 +5254,14 @@ static ssize_t
5066tracing_stats_read(struct file *filp, char __user *ubuf, 5254tracing_stats_read(struct file *filp, char __user *ubuf,
5067 size_t count, loff_t *ppos) 5255 size_t count, loff_t *ppos)
5068{ 5256{
5069 struct trace_cpu *tc = filp->private_data; 5257 struct inode *inode = file_inode(filp);
5070 struct trace_array *tr = tc->tr; 5258 struct trace_array *tr = inode->i_private;
5071 struct trace_buffer *trace_buf = &tr->trace_buffer; 5259 struct trace_buffer *trace_buf = &tr->trace_buffer;
5260 int cpu = tracing_get_cpu(inode);
5072 struct trace_seq *s; 5261 struct trace_seq *s;
5073 unsigned long cnt; 5262 unsigned long cnt;
5074 unsigned long long t; 5263 unsigned long long t;
5075 unsigned long usec_rem; 5264 unsigned long usec_rem;
5076 int cpu = tc->cpu;
5077 5265
5078 s = kmalloc(sizeof(*s), GFP_KERNEL); 5266 s = kmalloc(sizeof(*s), GFP_KERNEL);
5079 if (!s) 5267 if (!s)
@@ -5126,9 +5314,10 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
5126} 5314}
5127 5315
5128static const struct file_operations tracing_stats_fops = { 5316static const struct file_operations tracing_stats_fops = {
5129 .open = tracing_open_generic, 5317 .open = tracing_open_generic_tr,
5130 .read = tracing_stats_read, 5318 .read = tracing_stats_read,
5131 .llseek = generic_file_llseek, 5319 .llseek = generic_file_llseek,
5320 .release = tracing_release_generic_tr,
5132}; 5321};
5133 5322
5134#ifdef CONFIG_DYNAMIC_FTRACE 5323#ifdef CONFIG_DYNAMIC_FTRACE
@@ -5317,10 +5506,20 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
5317 return tr->percpu_dir; 5506 return tr->percpu_dir;
5318} 5507}
5319 5508
5509static struct dentry *
5510trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
5511 void *data, long cpu, const struct file_operations *fops)
5512{
5513 struct dentry *ret = trace_create_file(name, mode, parent, data, fops);
5514
5515 if (ret) /* See tracing_get_cpu() */
5516 ret->d_inode->i_cdev = (void *)(cpu + 1);
5517 return ret;
5518}
5519
5320static void 5520static void
5321tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) 5521tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
5322{ 5522{
5323 struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
5324 struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); 5523 struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
5325 struct dentry *d_cpu; 5524 struct dentry *d_cpu;
5326 char cpu_dir[30]; /* 30 characters should be more than enough */ 5525 char cpu_dir[30]; /* 30 characters should be more than enough */
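
The per-cpu files no longer carry a struct trace_cpu; instead trace_create_cpu_file() stores cpu + 1 in the dentry's inode as i_cdev, and tracing_get_cpu() decodes it, with a NULL i_cdev (the top-level, non-per-cpu files) mapping to RING_BUFFER_ALL_CPUS. The encode/decode pair in isolation, as a small stand-alone sketch of the trick rather than new API:

#include <stdio.h>

#define ALL_CPUS -1                     /* stands in for RING_BUFFER_ALL_CPUS */

/* Encode: 0 must stay "unset", so store cpu + 1 (as trace_create_cpu_file() does). */
static void *encode_cpu(long cpu)
{
	return (void *)(cpu + 1);
}

/* Decode: NULL means "no specific cpu" (as tracing_get_cpu() does). */
static int decode_cpu(void *cookie)
{
	if (cookie)
		return (long)cookie - 1;
	return ALL_CPUS;
}

int main(void)
{
	printf("cpu 0 -> cookie %p -> %d\n", encode_cpu(0), decode_cpu(encode_cpu(0)));
	printf("cpu 3 -> cookie %p -> %d\n", encode_cpu(3), decode_cpu(encode_cpu(3)));
	printf("unset -> cookie %p -> %d\n", (void *)0, decode_cpu(NULL));
	return 0;
}

The +1 shift is what keeps a genuinely unset i_cdev distinguishable from cpu 0.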
@@ -5336,28 +5535,28 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
5336 } 5535 }
5337 5536
5338 /* per cpu trace_pipe */ 5537 /* per cpu trace_pipe */
5339 trace_create_file("trace_pipe", 0444, d_cpu, 5538 trace_create_cpu_file("trace_pipe", 0444, d_cpu,
5340 (void *)&data->trace_cpu, &tracing_pipe_fops); 5539 tr, cpu, &tracing_pipe_fops);
5341 5540
5342 /* per cpu trace */ 5541 /* per cpu trace */
5343 trace_create_file("trace", 0644, d_cpu, 5542 trace_create_cpu_file("trace", 0644, d_cpu,
5344 (void *)&data->trace_cpu, &tracing_fops); 5543 tr, cpu, &tracing_fops);
5345 5544
5346 trace_create_file("trace_pipe_raw", 0444, d_cpu, 5545 trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu,
5347 (void *)&data->trace_cpu, &tracing_buffers_fops); 5546 tr, cpu, &tracing_buffers_fops);
5348 5547
5349 trace_create_file("stats", 0444, d_cpu, 5548 trace_create_cpu_file("stats", 0444, d_cpu,
5350 (void *)&data->trace_cpu, &tracing_stats_fops); 5549 tr, cpu, &tracing_stats_fops);
5351 5550
5352 trace_create_file("buffer_size_kb", 0444, d_cpu, 5551 trace_create_cpu_file("buffer_size_kb", 0444, d_cpu,
5353 (void *)&data->trace_cpu, &tracing_entries_fops); 5552 tr, cpu, &tracing_entries_fops);
5354 5553
5355#ifdef CONFIG_TRACER_SNAPSHOT 5554#ifdef CONFIG_TRACER_SNAPSHOT
5356 trace_create_file("snapshot", 0644, d_cpu, 5555 trace_create_cpu_file("snapshot", 0644, d_cpu,
5357 (void *)&data->trace_cpu, &snapshot_fops); 5556 tr, cpu, &snapshot_fops);
5358 5557
5359 trace_create_file("snapshot_raw", 0444, d_cpu, 5558 trace_create_cpu_file("snapshot_raw", 0444, d_cpu,
5360 (void *)&data->trace_cpu, &snapshot_raw_fops); 5559 tr, cpu, &snapshot_raw_fops);
5361#endif 5560#endif
5362} 5561}
5363 5562
@@ -5612,15 +5811,10 @@ rb_simple_read(struct file *filp, char __user *ubuf,
5612 size_t cnt, loff_t *ppos) 5811 size_t cnt, loff_t *ppos)
5613{ 5812{
5614 struct trace_array *tr = filp->private_data; 5813 struct trace_array *tr = filp->private_data;
5615 struct ring_buffer *buffer = tr->trace_buffer.buffer;
5616 char buf[64]; 5814 char buf[64];
5617 int r; 5815 int r;
5618 5816
5619 if (buffer) 5817 r = tracer_tracing_is_on(tr);
5620 r = ring_buffer_record_is_on(buffer);
5621 else
5622 r = 0;
5623
5624 r = sprintf(buf, "%d\n", r); 5818 r = sprintf(buf, "%d\n", r);
5625 5819
5626 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 5820 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -5642,11 +5836,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
5642 if (buffer) { 5836 if (buffer) {
5643 mutex_lock(&trace_types_lock); 5837 mutex_lock(&trace_types_lock);
5644 if (val) { 5838 if (val) {
5645 ring_buffer_record_on(buffer); 5839 tracer_tracing_on(tr);
5646 if (tr->current_trace->start) 5840 if (tr->current_trace->start)
5647 tr->current_trace->start(tr); 5841 tr->current_trace->start(tr);
5648 } else { 5842 } else {
5649 ring_buffer_record_off(buffer); 5843 tracer_tracing_off(tr);
5650 if (tr->current_trace->stop) 5844 if (tr->current_trace->stop)
5651 tr->current_trace->stop(tr); 5845 tr->current_trace->stop(tr);
5652 } 5846 }
@@ -5659,9 +5853,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
5659} 5853}
5660 5854
5661static const struct file_operations rb_simple_fops = { 5855static const struct file_operations rb_simple_fops = {
5662 .open = tracing_open_generic, 5856 .open = tracing_open_generic_tr,
5663 .read = rb_simple_read, 5857 .read = rb_simple_read,
5664 .write = rb_simple_write, 5858 .write = rb_simple_write,
5859 .release = tracing_release_generic_tr,
5665 .llseek = default_llseek, 5860 .llseek = default_llseek,
5666}; 5861};
5667 5862
@@ -5670,17 +5865,6 @@ struct dentry *trace_instance_dir;
5670static void 5865static void
5671init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); 5866init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
5672 5867
5673static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf)
5674{
5675 int cpu;
5676
5677 for_each_tracing_cpu(cpu) {
5678 memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu));
5679 per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu;
5680 per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr;
5681 }
5682}
5683
5684static int 5868static int
5685allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) 5869allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
5686{ 5870{
@@ -5698,8 +5882,6 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
5698 return -ENOMEM; 5882 return -ENOMEM;
5699 } 5883 }
5700 5884
5701 init_trace_buffers(tr, buf);
5702
5703 /* Allocate the first page for all buffers */ 5885 /* Allocate the first page for all buffers */
5704 set_buffer_entries(&tr->trace_buffer, 5886 set_buffer_entries(&tr->trace_buffer,
5705 ring_buffer_size(tr->trace_buffer.buffer, 0)); 5887 ring_buffer_size(tr->trace_buffer.buffer, 0));
@@ -5766,17 +5948,15 @@ static int new_instance_create(const char *name)
5766 if (allocate_trace_buffers(tr, trace_buf_size) < 0) 5948 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
5767 goto out_free_tr; 5949 goto out_free_tr;
5768 5950
5769 /* Holder for file callbacks */
5770 tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
5771 tr->trace_cpu.tr = tr;
5772
5773 tr->dir = debugfs_create_dir(name, trace_instance_dir); 5951 tr->dir = debugfs_create_dir(name, trace_instance_dir);
5774 if (!tr->dir) 5952 if (!tr->dir)
5775 goto out_free_tr; 5953 goto out_free_tr;
5776 5954
5777 ret = event_trace_add_tracer(tr->dir, tr); 5955 ret = event_trace_add_tracer(tr->dir, tr);
5778 if (ret) 5956 if (ret) {
5957 debugfs_remove_recursive(tr->dir);
5779 goto out_free_tr; 5958 goto out_free_tr;
5959 }
5780 5960
5781 init_tracer_debugfs(tr, tr->dir); 5961 init_tracer_debugfs(tr, tr->dir);
5782 5962
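
The one-line fix in new_instance_create() is about unwinding in reverse order: the instance directory already exists by the time event_trace_add_tracer() can still fail, so the error path must remove that directory itself before jumping to out_free_tr, which only frees the buffers and the trace_array. The same goto-based unwind shape as a small, self-contained model with hypothetical stand-ins for the kernel helpers:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int create_instance(const char *name, int fail_late_step)
{
	char *buffers, *dir;
	int ret = -ENOMEM;

	buffers = malloc(64);                     /* allocate_trace_buffers() analogue */
	if (!buffers)
		return ret;

	dir = strdup(name);                       /* debugfs_create_dir() analogue */
	if (!dir)
		goto out_free_buffers;

	if (fail_late_step) {                     /* event_trace_add_tracer() analogue */
		ret = -EINVAL;
		free(dir);                        /* the fix: undo this step here ... */
		goto out_free_buffers;            /* ... before the common cleanup */
	}

	printf("instance '%s' ready\n", dir);
	free(dir);                                /* demo only: tear it down again */
	free(buffers);
	return 0;

out_free_buffers:
	free(buffers);
	return ret;
}

int main(void)
{
	create_instance("demo", 0);
	printf("late failure -> %d\n", create_instance("demo", 1));
	return 0;
}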
@@ -5922,18 +6102,18 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
5922 tr, &tracing_iter_fops); 6102 tr, &tracing_iter_fops);
5923 6103
5924 trace_create_file("trace", 0644, d_tracer, 6104 trace_create_file("trace", 0644, d_tracer,
5925 (void *)&tr->trace_cpu, &tracing_fops); 6105 tr, &tracing_fops);
5926 6106
5927 trace_create_file("trace_pipe", 0444, d_tracer, 6107 trace_create_file("trace_pipe", 0444, d_tracer,
5928 (void *)&tr->trace_cpu, &tracing_pipe_fops); 6108 tr, &tracing_pipe_fops);
5929 6109
5930 trace_create_file("buffer_size_kb", 0644, d_tracer, 6110 trace_create_file("buffer_size_kb", 0644, d_tracer,
5931 (void *)&tr->trace_cpu, &tracing_entries_fops); 6111 tr, &tracing_entries_fops);
5932 6112
5933 trace_create_file("buffer_total_size_kb", 0444, d_tracer, 6113 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
5934 tr, &tracing_total_entries_fops); 6114 tr, &tracing_total_entries_fops);
5935 6115
5936 trace_create_file("free_buffer", 0644, d_tracer, 6116 trace_create_file("free_buffer", 0200, d_tracer,
5937 tr, &tracing_free_buffer_fops); 6117 tr, &tracing_free_buffer_fops);
5938 6118
5939 trace_create_file("trace_marker", 0220, d_tracer, 6119 trace_create_file("trace_marker", 0220, d_tracer,
@@ -5943,11 +6123,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
5943 &trace_clock_fops); 6123 &trace_clock_fops);
5944 6124
5945 trace_create_file("tracing_on", 0644, d_tracer, 6125 trace_create_file("tracing_on", 0644, d_tracer,
5946 tr, &rb_simple_fops); 6126 tr, &rb_simple_fops);
5947 6127
5948#ifdef CONFIG_TRACER_SNAPSHOT 6128#ifdef CONFIG_TRACER_SNAPSHOT
5949 trace_create_file("snapshot", 0644, d_tracer, 6129 trace_create_file("snapshot", 0644, d_tracer,
5950 (void *)&tr->trace_cpu, &snapshot_fops); 6130 tr, &snapshot_fops);
5951#endif 6131#endif
5952 6132
5953 for_each_tracing_cpu(cpu) 6133 for_each_tracing_cpu(cpu)
@@ -6241,10 +6421,6 @@ __init static int tracer_alloc_buffers(void)
6241 6421
6242 global_trace.flags = TRACE_ARRAY_FL_GLOBAL; 6422 global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
6243 6423
6244 /* Holder for file callbacks */
6245 global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
6246 global_trace.trace_cpu.tr = &global_trace;
6247
6248 INIT_LIST_HEAD(&global_trace.systems); 6424 INIT_LIST_HEAD(&global_trace.systems);
6249 INIT_LIST_HEAD(&global_trace.events); 6425 INIT_LIST_HEAD(&global_trace.events);
6250 list_add(&global_trace.list, &ftrace_trace_arrays); 6426 list_add(&global_trace.list, &ftrace_trace_arrays);
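The trace.c hunks above drop the per-file trace_cpu holder and instead hand the trace_array itself to debugfs via i_private, pairing it with the _tr open/release helpers wired into rb_simple_fops. A minimal userspace sketch of that reference-counted open/release pattern follows; the struct layout, the "dying" field and the helper bodies are illustrative only — the real trace_array_get()/trace_array_put() live in kernel/trace/trace.c and are declared in the trace.h hunk further down.

/* Sketch only: model of "every open pins the trace_array, every release
 * unpins it", so instance teardown cannot free tr under an open file. */
#include <stdio.h>
#include <errno.h>

struct trace_array { int ref; int dying; };

static int trace_array_get(struct trace_array *tr)
{
	if (tr->dying)			/* instance is being removed */
		return -ENODEV;
	tr->ref++;
	return 0;
}

static void trace_array_put(struct trace_array *tr)
{
	tr->ref--;
}

static int open_generic_tr(struct trace_array *tr)
{
	return trace_array_get(tr);	/* fail the open once tr is going away */
}

static void release_generic_tr(struct trace_array *tr)
{
	trace_array_put(tr);		/* drop the reference taken at open */
}

int main(void)
{
	struct trace_array tr = { .ref = 1 };

	if (!open_generic_tr(&tr))
		printf("opened, ref=%d\n", tr.ref);	/* ref=2 */
	release_generic_tr(&tr);
	printf("released, ref=%d\n", tr.ref);		/* ref=1 */
	return 0;
}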
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 20572ed88c5c..afaae41b0a02 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -130,19 +130,12 @@ enum trace_flag_type {
130 130
131struct trace_array; 131struct trace_array;
132 132
133struct trace_cpu {
134 struct trace_array *tr;
135 struct dentry *dir;
136 int cpu;
137};
138
139/* 133/*
140 * The CPU trace array - it consists of thousands of trace entries 134 * The CPU trace array - it consists of thousands of trace entries
141 * plus some other descriptor data: (for example which task started 135 * plus some other descriptor data: (for example which task started
142 * the trace, etc.) 136 * the trace, etc.)
143 */ 137 */
144struct trace_array_cpu { 138struct trace_array_cpu {
145 struct trace_cpu trace_cpu;
146 atomic_t disabled; 139 atomic_t disabled;
147 void *buffer_page; /* ring buffer spare */ 140 void *buffer_page; /* ring buffer spare */
148 141
@@ -196,7 +189,6 @@ struct trace_array {
196 bool allocated_snapshot; 189 bool allocated_snapshot;
197#endif 190#endif
198 int buffer_disabled; 191 int buffer_disabled;
199 struct trace_cpu trace_cpu; /* place holder */
200#ifdef CONFIG_FTRACE_SYSCALLS 192#ifdef CONFIG_FTRACE_SYSCALLS
201 int sys_refcount_enter; 193 int sys_refcount_enter;
202 int sys_refcount_exit; 194 int sys_refcount_exit;
@@ -214,7 +206,6 @@ struct trace_array {
214 struct dentry *event_dir; 206 struct dentry *event_dir;
215 struct list_head systems; 207 struct list_head systems;
216 struct list_head events; 208 struct list_head events;
217 struct task_struct *waiter;
218 int ref; 209 int ref;
219}; 210};
220 211
@@ -224,6 +215,11 @@ enum {
224 215
225extern struct list_head ftrace_trace_arrays; 216extern struct list_head ftrace_trace_arrays;
226 217
218extern struct mutex trace_types_lock;
219
220extern int trace_array_get(struct trace_array *tr);
221extern void trace_array_put(struct trace_array *tr);
222
227/* 223/*
228 * The global tracer (top) should be the first trace array added, 224 * The global tracer (top) should be the first trace array added,
229 * but we check the flag anyway. 225 * but we check the flag anyway.
@@ -554,11 +550,6 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu);
554 550
555void poll_wait_pipe(struct trace_iterator *iter); 551void poll_wait_pipe(struct trace_iterator *iter);
556 552
557void ftrace(struct trace_array *tr,
558 struct trace_array_cpu *data,
559 unsigned long ip,
560 unsigned long parent_ip,
561 unsigned long flags, int pc);
562void tracing_sched_switch_trace(struct trace_array *tr, 553void tracing_sched_switch_trace(struct trace_array *tr,
563 struct task_struct *prev, 554 struct task_struct *prev,
564 struct task_struct *next, 555 struct task_struct *next,
@@ -680,6 +671,15 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
680 struct trace_array *tr); 671 struct trace_array *tr);
681extern int trace_selftest_startup_branch(struct tracer *trace, 672extern int trace_selftest_startup_branch(struct tracer *trace,
682 struct trace_array *tr); 673 struct trace_array *tr);
674/*
675 * Tracer data references selftest functions that only occur
676 * on boot up. These can be __init functions. Thus, when selftests
677 * are enabled, then the tracers need to reference __init functions.
678 */
679#define __tracer_data __refdata
680#else
681/* Tracers are seldom changed. Optimize when selftests are disabled. */
682#define __tracer_data __read_mostly
683#endif /* CONFIG_FTRACE_STARTUP_TEST */ 683#endif /* CONFIG_FTRACE_STARTUP_TEST */
684 684
685extern void *head_page(struct trace_array_cpu *data); 685extern void *head_page(struct trace_array_cpu *data);
@@ -774,6 +774,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
774extern struct list_head ftrace_pids; 774extern struct list_head ftrace_pids;
775 775
776#ifdef CONFIG_FUNCTION_TRACER 776#ifdef CONFIG_FUNCTION_TRACER
777extern bool ftrace_filter_param __initdata;
777static inline int ftrace_trace_task(struct task_struct *task) 778static inline int ftrace_trace_task(struct task_struct *task)
778{ 779{
779 if (list_empty(&ftrace_pids)) 780 if (list_empty(&ftrace_pids))
@@ -899,12 +900,6 @@ static inline void trace_branch_disable(void)
899/* set ring buffers to default size if not already done so */ 900/* set ring buffers to default size if not already done so */
900int tracing_update_buffers(void); 901int tracing_update_buffers(void);
901 902
902/* trace event type bit fields, not numeric */
903enum {
904 TRACE_EVENT_TYPE_PRINTF = 1,
905 TRACE_EVENT_TYPE_RAW = 2,
906};
907
908struct ftrace_event_field { 903struct ftrace_event_field {
909 struct list_head link; 904 struct list_head link;
910 const char *name; 905 const char *name;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 84b1e045faba..80c36bcf66e8 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -236,6 +236,10 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
236 236
237 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); 237 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
238 238
239 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
240 "perf buffer not large enough"))
241 return NULL;
242
239 pc = preempt_count(); 243 pc = preempt_count();
240 244
241 *rctxp = perf_swevent_get_recursion_context(); 245 *rctxp = perf_swevent_get_recursion_context();
@@ -266,6 +270,10 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
266 struct pt_regs regs; 270 struct pt_regs regs;
267 int rctx; 271 int rctx;
268 272
273 head = this_cpu_ptr(event_function.perf_events);
274 if (hlist_empty(head))
275 return;
276
269#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ 277#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
270 sizeof(u64)) - sizeof(u32)) 278 sizeof(u64)) - sizeof(u32))
271 279
@@ -279,8 +287,6 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
279 287
280 entry->ip = ip; 288 entry->ip = ip;
281 entry->parent_ip = parent_ip; 289 entry->parent_ip = parent_ip;
282
283 head = this_cpu_ptr(event_function.perf_events);
284 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, 290 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
285 1, &regs, head, NULL); 291 1, &regs, head, NULL);
286 292
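A small sketch of the size guard added to perf_trace_buf_prepare() above; the companion hunk also bails out of perf_ftrace_function_call() early when no perf event is registered on this CPU. The buffer, constant value and stderr message here are simplified stand-ins — the kernel uses WARN_ONCE and its own per-CPU buffers.

#include <stdio.h>
#include <stddef.h>

#define PERF_MAX_TRACE_SIZE 8192	/* illustrative value */

static void *perf_trace_buf_prepare(size_t size)
{
	static char buf[PERF_MAX_TRACE_SIZE];

	if (size > PERF_MAX_TRACE_SIZE) {	/* WARN_ONCE(...) in the kernel */
		fprintf(stderr, "perf buffer not large enough\n");
		return NULL;
	}
	return buf;
}

int main(void)
{
	printf("%p\n", (void *)perf_trace_buf_prepare(64));	 /* valid buffer */
	printf("%p\n", (void *)perf_trace_buf_prepare(1 << 20)); /* NULL */
	return 0;
}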
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 27963e2bf4bf..29a7ebcfb426 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -41,6 +41,23 @@ static LIST_HEAD(ftrace_common_fields);
41static struct kmem_cache *field_cachep; 41static struct kmem_cache *field_cachep;
42static struct kmem_cache *file_cachep; 42static struct kmem_cache *file_cachep;
43 43
44#define SYSTEM_FL_FREE_NAME (1 << 31)
45
46static inline int system_refcount(struct event_subsystem *system)
47{
48 return system->ref_count & ~SYSTEM_FL_FREE_NAME;
49}
50
51static int system_refcount_inc(struct event_subsystem *system)
52{
53 return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME;
54}
55
56static int system_refcount_dec(struct event_subsystem *system)
57{
58 return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME;
59}
60
44/* Double loops, do not use break, only goto's work */ 61/* Double loops, do not use break, only goto's work */
45#define do_for_each_event_file(tr, file) \ 62#define do_for_each_event_file(tr, file) \
46 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ 63 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
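The system_refcount*() helpers introduced above pack a "name was dynamically allocated" flag into the top bit of ref_count while the low bits keep counting; __put_system() later frees the name only when the masked count reaches zero and the flag is set. A standalone sketch of that trick, with illustrative names:

#include <stdio.h>

#define FL_FREE_NAME (1u << 31)

static unsigned int ref = 1 | FL_FREE_NAME;	/* one user, dynamic name */

static unsigned int refcount(void)     { return ref & ~FL_FREE_NAME; }
static unsigned int refcount_dec(void) { return (--ref) & ~FL_FREE_NAME; }

int main(void)
{
	printf("count=%u\n", refcount());		/* 1 */
	if (!refcount_dec() && (ref & FL_FREE_NAME))
		printf("last put: would kfree(system->name)\n");
	return 0;
}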
@@ -97,7 +114,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
97 114
98 field = kmem_cache_alloc(field_cachep, GFP_TRACE); 115 field = kmem_cache_alloc(field_cachep, GFP_TRACE);
99 if (!field) 116 if (!field)
100 goto err; 117 return -ENOMEM;
101 118
102 field->name = name; 119 field->name = name;
103 field->type = type; 120 field->type = type;
@@ -114,11 +131,6 @@ static int __trace_define_field(struct list_head *head, const char *type,
114 list_add(&field->link, head); 131 list_add(&field->link, head);
115 132
116 return 0; 133 return 0;
117
118err:
119 kmem_cache_free(field_cachep, field);
120
121 return -ENOMEM;
122} 134}
123 135
124int trace_define_field(struct ftrace_event_call *call, const char *type, 136int trace_define_field(struct ftrace_event_call *call, const char *type,
@@ -279,9 +291,11 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
279 } 291 }
280 call->class->reg(call, TRACE_REG_UNREGISTER, file); 292 call->class->reg(call, TRACE_REG_UNREGISTER, file);
281 } 293 }
282 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ 294 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
283 if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) 295 if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
284 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); 296 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
297 else
298 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
285 break; 299 break;
286 case 1: 300 case 1:
287 /* 301 /*
@@ -349,8 +363,8 @@ static void __put_system(struct event_subsystem *system)
349{ 363{
350 struct event_filter *filter = system->filter; 364 struct event_filter *filter = system->filter;
351 365
352 WARN_ON_ONCE(system->ref_count == 0); 366 WARN_ON_ONCE(system_refcount(system) == 0);
353 if (--system->ref_count) 367 if (system_refcount_dec(system))
354 return; 368 return;
355 369
356 list_del(&system->list); 370 list_del(&system->list);
@@ -359,13 +373,15 @@ static void __put_system(struct event_subsystem *system)
359 kfree(filter->filter_string); 373 kfree(filter->filter_string);
360 kfree(filter); 374 kfree(filter);
361 } 375 }
376 if (system->ref_count & SYSTEM_FL_FREE_NAME)
377 kfree(system->name);
362 kfree(system); 378 kfree(system);
363} 379}
364 380
365static void __get_system(struct event_subsystem *system) 381static void __get_system(struct event_subsystem *system)
366{ 382{
367 WARN_ON_ONCE(system->ref_count == 0); 383 WARN_ON_ONCE(system_refcount(system) == 0);
368 system->ref_count++; 384 system_refcount_inc(system);
369} 385}
370 386
371static void __get_system_dir(struct ftrace_subsystem_dir *dir) 387static void __get_system_dir(struct ftrace_subsystem_dir *dir)
@@ -379,7 +395,7 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir)
379{ 395{
380 WARN_ON_ONCE(dir->ref_count == 0); 396 WARN_ON_ONCE(dir->ref_count == 0);
381 /* If the subsystem is about to be freed, the dir must be too */ 397 /* If the subsystem is about to be freed, the dir must be too */
382 WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); 398 WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1);
383 399
384 __put_system(dir->subsystem); 400 __put_system(dir->subsystem);
385 if (!--dir->ref_count) 401 if (!--dir->ref_count)
@@ -393,17 +409,55 @@ static void put_system(struct ftrace_subsystem_dir *dir)
393 mutex_unlock(&event_mutex); 409 mutex_unlock(&event_mutex);
394} 410}
395 411
412static void remove_subsystem(struct ftrace_subsystem_dir *dir)
413{
414 if (!dir)
415 return;
416
417 if (!--dir->nr_events) {
418 debugfs_remove_recursive(dir->entry);
419 list_del(&dir->list);
420 __put_system_dir(dir);
421 }
422}
423
424static void *event_file_data(struct file *filp)
425{
426 return ACCESS_ONCE(file_inode(filp)->i_private);
427}
428
429static void remove_event_file_dir(struct ftrace_event_file *file)
430{
431 struct dentry *dir = file->dir;
432 struct dentry *child;
433
434 if (dir) {
435 spin_lock(&dir->d_lock); /* probably unneeded */
436 list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) {
437 if (child->d_inode) /* probably unneeded */
438 child->d_inode->i_private = NULL;
439 }
440 spin_unlock(&dir->d_lock);
441
442 debugfs_remove_recursive(dir);
443 }
444
445 list_del(&file->list);
446 remove_subsystem(file->system);
447 kmem_cache_free(file_cachep, file);
448}
449
396/* 450/*
397 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 451 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
398 */ 452 */
399static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, 453static int
400 const char *sub, const char *event, int set) 454__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
455 const char *sub, const char *event, int set)
401{ 456{
402 struct ftrace_event_file *file; 457 struct ftrace_event_file *file;
403 struct ftrace_event_call *call; 458 struct ftrace_event_call *call;
404 int ret = -EINVAL; 459 int ret = -EINVAL;
405 460
406 mutex_lock(&event_mutex);
407 list_for_each_entry(file, &tr->events, list) { 461 list_for_each_entry(file, &tr->events, list) {
408 462
409 call = file->event_call; 463 call = file->event_call;
@@ -429,6 +483,17 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
429 483
430 ret = 0; 484 ret = 0;
431 } 485 }
486
487 return ret;
488}
489
490static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
491 const char *sub, const char *event, int set)
492{
493 int ret;
494
495 mutex_lock(&event_mutex);
496 ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set);
432 mutex_unlock(&event_mutex); 497 mutex_unlock(&event_mutex);
433 498
434 return ret; 499 return ret;
@@ -623,18 +688,28 @@ static ssize_t
623event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 688event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
624 loff_t *ppos) 689 loff_t *ppos)
625{ 690{
626 struct ftrace_event_file *file = filp->private_data; 691 struct ftrace_event_file *file;
627 char *buf; 692 unsigned long flags;
693 char buf[4] = "0";
628 694
629 if (file->flags & FTRACE_EVENT_FL_ENABLED) { 695 mutex_lock(&event_mutex);
630 if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) 696 file = event_file_data(filp);
631 buf = "0*\n"; 697 if (likely(file))
632 else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) 698 flags = file->flags;
633 buf = "1*\n"; 699 mutex_unlock(&event_mutex);
634 else 700
635 buf = "1\n"; 701 if (!file)
636 } else 702 return -ENODEV;
637 buf = "0\n"; 703
704 if (flags & FTRACE_EVENT_FL_ENABLED &&
705 !(flags & FTRACE_EVENT_FL_SOFT_DISABLED))
706 strcpy(buf, "1");
707
708 if (flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
709 flags & FTRACE_EVENT_FL_SOFT_MODE)
710 strcat(buf, "*");
711
712 strcat(buf, "\n");
638 713
639 return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); 714 return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));
640} 715}
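The reworked event_enable_read() above is the reader side of the new event_file_data() scheme: teardown NULLs the i_private back-pointer, and readers re-fetch it under event_mutex, returning -ENODEV once the file is gone. A minimal userspace model of that pattern follows; the struct, the global standing in for inode->i_private and the pthread mutex are all illustrative, not kernel API (build with -pthread).

#include <stdio.h>
#include <errno.h>
#include <pthread.h>

struct event_file { int enabled; };

static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct event_file *i_private;	/* stands in for inode->i_private */

static int event_enable_read(char *buf)
{
	struct event_file *file;
	int ret = -ENODEV;

	pthread_mutex_lock(&event_mutex);
	file = i_private;		/* event_file_data() */
	if (file) {
		buf[0] = file->enabled ? '1' : '0';
		buf[1] = '\0';
		ret = 0;
	}
	pthread_mutex_unlock(&event_mutex);
	return ret;
}

int main(void)
{
	static struct event_file f = { .enabled = 1 };
	char buf[4];

	i_private = &f;
	printf("%d %s\n", event_enable_read(buf), buf);	/* 0 1 */
	i_private = NULL;		/* what remove_event_file_dir() does */
	printf("%d\n", event_enable_read(buf));		/* -ENODEV */
	return 0;
}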
@@ -643,13 +718,10 @@ static ssize_t
643event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 718event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
644 loff_t *ppos) 719 loff_t *ppos)
645{ 720{
646 struct ftrace_event_file *file = filp->private_data; 721 struct ftrace_event_file *file;
647 unsigned long val; 722 unsigned long val;
648 int ret; 723 int ret;
649 724
650 if (!file)
651 return -EINVAL;
652
653 ret = kstrtoul_from_user(ubuf, cnt, 10, &val); 725 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
654 if (ret) 726 if (ret)
655 return ret; 727 return ret;
@@ -661,8 +733,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
661 switch (val) { 733 switch (val) {
662 case 0: 734 case 0:
663 case 1: 735 case 1:
736 ret = -ENODEV;
664 mutex_lock(&event_mutex); 737 mutex_lock(&event_mutex);
665 ret = ftrace_event_enable_disable(file, val); 738 file = event_file_data(filp);
739 if (likely(file))
740 ret = ftrace_event_enable_disable(file, val);
666 mutex_unlock(&event_mutex); 741 mutex_unlock(&event_mutex);
667 break; 742 break;
668 743
@@ -769,65 +844,39 @@ enum {
769 844
770static void *f_next(struct seq_file *m, void *v, loff_t *pos) 845static void *f_next(struct seq_file *m, void *v, loff_t *pos)
771{ 846{
772 struct ftrace_event_call *call = m->private; 847 struct ftrace_event_call *call = event_file_data(m->private);
773 struct ftrace_event_field *field;
774 struct list_head *common_head = &ftrace_common_fields; 848 struct list_head *common_head = &ftrace_common_fields;
775 struct list_head *head = trace_get_fields(call); 849 struct list_head *head = trace_get_fields(call);
850 struct list_head *node = v;
776 851
777 (*pos)++; 852 (*pos)++;
778 853
779 switch ((unsigned long)v) { 854 switch ((unsigned long)v) {
780 case FORMAT_HEADER: 855 case FORMAT_HEADER:
781 if (unlikely(list_empty(common_head))) 856 node = common_head;
782 return NULL; 857 break;
783
784 field = list_entry(common_head->prev,
785 struct ftrace_event_field, link);
786 return field;
787 858
788 case FORMAT_FIELD_SEPERATOR: 859 case FORMAT_FIELD_SEPERATOR:
789 if (unlikely(list_empty(head))) 860 node = head;
790 return NULL; 861 break;
791
792 field = list_entry(head->prev, struct ftrace_event_field, link);
793 return field;
794 862
795 case FORMAT_PRINTFMT: 863 case FORMAT_PRINTFMT:
796 /* all done */ 864 /* all done */
797 return NULL; 865 return NULL;
798 } 866 }
799 867
800 field = v; 868 node = node->prev;
801 if (field->link.prev == common_head) 869 if (node == common_head)
802 return (void *)FORMAT_FIELD_SEPERATOR; 870 return (void *)FORMAT_FIELD_SEPERATOR;
803 else if (field->link.prev == head) 871 else if (node == head)
804 return (void *)FORMAT_PRINTFMT; 872 return (void *)FORMAT_PRINTFMT;
805 873 else
806 field = list_entry(field->link.prev, struct ftrace_event_field, link); 874 return node;
807
808 return field;
809}
810
811static void *f_start(struct seq_file *m, loff_t *pos)
812{
813 loff_t l = 0;
814 void *p;
815
816 /* Start by showing the header */
817 if (!*pos)
818 return (void *)FORMAT_HEADER;
819
820 p = (void *)FORMAT_HEADER;
821 do {
822 p = f_next(m, p, &l);
823 } while (p && l < *pos);
824
825 return p;
826} 875}
827 876
828static int f_show(struct seq_file *m, void *v) 877static int f_show(struct seq_file *m, void *v)
829{ 878{
830 struct ftrace_event_call *call = m->private; 879 struct ftrace_event_call *call = event_file_data(m->private);
831 struct ftrace_event_field *field; 880 struct ftrace_event_field *field;
832 const char *array_descriptor; 881 const char *array_descriptor;
833 882
@@ -848,8 +897,7 @@ static int f_show(struct seq_file *m, void *v)
848 return 0; 897 return 0;
849 } 898 }
850 899
851 field = v; 900 field = list_entry(v, struct ftrace_event_field, link);
852
853 /* 901 /*
854 * Smartly shows the array type(except dynamic array). 902 * Smartly shows the array type(except dynamic array).
855 * Normal: 903 * Normal:
@@ -876,8 +924,25 @@ static int f_show(struct seq_file *m, void *v)
876 return 0; 924 return 0;
877} 925}
878 926
927static void *f_start(struct seq_file *m, loff_t *pos)
928{
929 void *p = (void *)FORMAT_HEADER;
930 loff_t l = 0;
931
932 /* ->stop() is called even if ->start() fails */
933 mutex_lock(&event_mutex);
934 if (!event_file_data(m->private))
935 return ERR_PTR(-ENODEV);
936
937 while (l < *pos && p)
938 p = f_next(m, p, &l);
939
940 return p;
941}
942
879static void f_stop(struct seq_file *m, void *p) 943static void f_stop(struct seq_file *m, void *p)
880{ 944{
945 mutex_unlock(&event_mutex);
881} 946}
882 947
883static const struct seq_operations trace_format_seq_ops = { 948static const struct seq_operations trace_format_seq_ops = {
@@ -889,7 +954,6 @@ static const struct seq_operations trace_format_seq_ops = {
889 954
890static int trace_format_open(struct inode *inode, struct file *file) 955static int trace_format_open(struct inode *inode, struct file *file)
891{ 956{
892 struct ftrace_event_call *call = inode->i_private;
893 struct seq_file *m; 957 struct seq_file *m;
894 int ret; 958 int ret;
895 959
@@ -898,7 +962,7 @@ static int trace_format_open(struct inode *inode, struct file *file)
898 return ret; 962 return ret;
899 963
900 m = file->private_data; 964 m = file->private_data;
901 m->private = call; 965 m->private = file;
902 966
903 return 0; 967 return 0;
904} 968}
@@ -906,45 +970,47 @@ static int trace_format_open(struct inode *inode, struct file *file)
906static ssize_t 970static ssize_t
907event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) 971event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
908{ 972{
909 struct ftrace_event_call *call = filp->private_data; 973 int id = (long)event_file_data(filp);
910 struct trace_seq *s; 974 char buf[32];
911 int r; 975 int len;
912 976
913 if (*ppos) 977 if (*ppos)
914 return 0; 978 return 0;
915 979
916 s = kmalloc(sizeof(*s), GFP_KERNEL); 980 if (unlikely(!id))
917 if (!s) 981 return -ENODEV;
918 return -ENOMEM;
919 982
920 trace_seq_init(s); 983 len = sprintf(buf, "%d\n", id);
921 trace_seq_printf(s, "%d\n", call->event.type);
922 984
923 r = simple_read_from_buffer(ubuf, cnt, ppos, 985 return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
924 s->buffer, s->len);
925 kfree(s);
926 return r;
927} 986}
928 987
929static ssize_t 988static ssize_t
930event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 989event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
931 loff_t *ppos) 990 loff_t *ppos)
932{ 991{
933 struct ftrace_event_call *call = filp->private_data; 992 struct ftrace_event_call *call;
934 struct trace_seq *s; 993 struct trace_seq *s;
935 int r; 994 int r = -ENODEV;
936 995
937 if (*ppos) 996 if (*ppos)
938 return 0; 997 return 0;
939 998
940 s = kmalloc(sizeof(*s), GFP_KERNEL); 999 s = kmalloc(sizeof(*s), GFP_KERNEL);
1000
941 if (!s) 1001 if (!s)
942 return -ENOMEM; 1002 return -ENOMEM;
943 1003
944 trace_seq_init(s); 1004 trace_seq_init(s);
945 1005
946 print_event_filter(call, s); 1006 mutex_lock(&event_mutex);
947 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1007 call = event_file_data(filp);
1008 if (call)
1009 print_event_filter(call, s);
1010 mutex_unlock(&event_mutex);
1011
1012 if (call)
1013 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
948 1014
949 kfree(s); 1015 kfree(s);
950 1016
@@ -955,9 +1021,9 @@ static ssize_t
955event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, 1021event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
956 loff_t *ppos) 1022 loff_t *ppos)
957{ 1023{
958 struct ftrace_event_call *call = filp->private_data; 1024 struct ftrace_event_call *call;
959 char *buf; 1025 char *buf;
960 int err; 1026 int err = -ENODEV;
961 1027
962 if (cnt >= PAGE_SIZE) 1028 if (cnt >= PAGE_SIZE)
963 return -EINVAL; 1029 return -EINVAL;
@@ -972,7 +1038,12 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
972 } 1038 }
973 buf[cnt] = '\0'; 1039 buf[cnt] = '\0';
974 1040
975 err = apply_event_filter(call, buf); 1041 mutex_lock(&event_mutex);
1042 call = event_file_data(filp);
1043 if (call)
1044 err = apply_event_filter(call, buf);
1045 mutex_unlock(&event_mutex);
1046
976 free_page((unsigned long) buf); 1047 free_page((unsigned long) buf);
977 if (err < 0) 1048 if (err < 0)
978 return err; 1049 return err;
@@ -992,6 +1063,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
992 int ret; 1063 int ret;
993 1064
994 /* Make sure the system still exists */ 1065 /* Make sure the system still exists */
1066 mutex_lock(&trace_types_lock);
995 mutex_lock(&event_mutex); 1067 mutex_lock(&event_mutex);
996 list_for_each_entry(tr, &ftrace_trace_arrays, list) { 1068 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
997 list_for_each_entry(dir, &tr->systems, list) { 1069 list_for_each_entry(dir, &tr->systems, list) {
@@ -1007,6 +1079,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
1007 } 1079 }
1008 exit_loop: 1080 exit_loop:
1009 mutex_unlock(&event_mutex); 1081 mutex_unlock(&event_mutex);
1082 mutex_unlock(&trace_types_lock);
1010 1083
1011 if (!system) 1084 if (!system)
1012 return -ENODEV; 1085 return -ENODEV;
@@ -1014,9 +1087,17 @@ static int subsystem_open(struct inode *inode, struct file *filp)
1014 /* Some versions of gcc think dir can be uninitialized here */ 1087 /* Some versions of gcc think dir can be uninitialized here */
1015 WARN_ON(!dir); 1088 WARN_ON(!dir);
1016 1089
1090 /* Still need to increment the ref count of the system */
1091 if (trace_array_get(tr) < 0) {
1092 put_system(dir);
1093 return -ENODEV;
1094 }
1095
1017 ret = tracing_open_generic(inode, filp); 1096 ret = tracing_open_generic(inode, filp);
1018 if (ret < 0) 1097 if (ret < 0) {
1098 trace_array_put(tr);
1019 put_system(dir); 1099 put_system(dir);
1100 }
1020 1101
1021 return ret; 1102 return ret;
1022} 1103}
@@ -1027,16 +1108,23 @@ static int system_tr_open(struct inode *inode, struct file *filp)
1027 struct trace_array *tr = inode->i_private; 1108 struct trace_array *tr = inode->i_private;
1028 int ret; 1109 int ret;
1029 1110
1111 if (trace_array_get(tr) < 0)
1112 return -ENODEV;
1113
1030 /* Make a temporary dir that has no system but points to tr */ 1114 /* Make a temporary dir that has no system but points to tr */
1031 dir = kzalloc(sizeof(*dir), GFP_KERNEL); 1115 dir = kzalloc(sizeof(*dir), GFP_KERNEL);
1032 if (!dir) 1116 if (!dir) {
1117 trace_array_put(tr);
1033 return -ENOMEM; 1118 return -ENOMEM;
1119 }
1034 1120
1035 dir->tr = tr; 1121 dir->tr = tr;
1036 1122
1037 ret = tracing_open_generic(inode, filp); 1123 ret = tracing_open_generic(inode, filp);
1038 if (ret < 0) 1124 if (ret < 0) {
1125 trace_array_put(tr);
1039 kfree(dir); 1126 kfree(dir);
1127 }
1040 1128
1041 filp->private_data = dir; 1129 filp->private_data = dir;
1042 1130
@@ -1047,6 +1135,8 @@ static int subsystem_release(struct inode *inode, struct file *file)
1047{ 1135{
1048 struct ftrace_subsystem_dir *dir = file->private_data; 1136 struct ftrace_subsystem_dir *dir = file->private_data;
1049 1137
1138 trace_array_put(dir->tr);
1139
1050 /* 1140 /*
1051 * If dir->subsystem is NULL, then this is a temporary 1141 * If dir->subsystem is NULL, then this is a temporary
1052 * descriptor that was made for a trace_array to enable 1142 * descriptor that was made for a trace_array to enable
@@ -1143,6 +1233,7 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
1143 1233
1144static int ftrace_event_avail_open(struct inode *inode, struct file *file); 1234static int ftrace_event_avail_open(struct inode *inode, struct file *file);
1145static int ftrace_event_set_open(struct inode *inode, struct file *file); 1235static int ftrace_event_set_open(struct inode *inode, struct file *file);
1236static int ftrace_event_release(struct inode *inode, struct file *file);
1146 1237
1147static const struct seq_operations show_event_seq_ops = { 1238static const struct seq_operations show_event_seq_ops = {
1148 .start = t_start, 1239 .start = t_start,
@@ -1170,7 +1261,7 @@ static const struct file_operations ftrace_set_event_fops = {
1170 .read = seq_read, 1261 .read = seq_read,
1171 .write = ftrace_event_write, 1262 .write = ftrace_event_write,
1172 .llseek = seq_lseek, 1263 .llseek = seq_lseek,
1173 .release = seq_release, 1264 .release = ftrace_event_release,
1174}; 1265};
1175 1266
1176static const struct file_operations ftrace_enable_fops = { 1267static const struct file_operations ftrace_enable_fops = {
@@ -1188,7 +1279,6 @@ static const struct file_operations ftrace_event_format_fops = {
1188}; 1279};
1189 1280
1190static const struct file_operations ftrace_event_id_fops = { 1281static const struct file_operations ftrace_event_id_fops = {
1191 .open = tracing_open_generic,
1192 .read = event_id_read, 1282 .read = event_id_read,
1193 .llseek = default_llseek, 1283 .llseek = default_llseek,
1194}; 1284};
@@ -1247,6 +1337,15 @@ ftrace_event_open(struct inode *inode, struct file *file,
1247 return ret; 1337 return ret;
1248} 1338}
1249 1339
1340static int ftrace_event_release(struct inode *inode, struct file *file)
1341{
1342 struct trace_array *tr = inode->i_private;
1343
1344 trace_array_put(tr);
1345
1346 return seq_release(inode, file);
1347}
1348
1250static int 1349static int
1251ftrace_event_avail_open(struct inode *inode, struct file *file) 1350ftrace_event_avail_open(struct inode *inode, struct file *file)
1252{ 1351{
@@ -1260,12 +1359,19 @@ ftrace_event_set_open(struct inode *inode, struct file *file)
1260{ 1359{
1261 const struct seq_operations *seq_ops = &show_set_event_seq_ops; 1360 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1262 struct trace_array *tr = inode->i_private; 1361 struct trace_array *tr = inode->i_private;
1362 int ret;
1363
1364 if (trace_array_get(tr) < 0)
1365 return -ENODEV;
1263 1366
1264 if ((file->f_mode & FMODE_WRITE) && 1367 if ((file->f_mode & FMODE_WRITE) &&
1265 (file->f_flags & O_TRUNC)) 1368 (file->f_flags & O_TRUNC))
1266 ftrace_clear_events(tr); 1369 ftrace_clear_events(tr);
1267 1370
1268 return ftrace_event_open(inode, file, seq_ops); 1371 ret = ftrace_event_open(inode, file, seq_ops);
1372 if (ret < 0)
1373 trace_array_put(tr);
1374 return ret;
1269} 1375}
1270 1376
1271static struct event_subsystem * 1377static struct event_subsystem *
@@ -1279,7 +1385,15 @@ create_new_subsystem(const char *name)
1279 return NULL; 1385 return NULL;
1280 1386
1281 system->ref_count = 1; 1387 system->ref_count = 1;
1282 system->name = name; 1388
1389 /* Only allocate if dynamic (kprobes and modules) */
1390 if (!core_kernel_data((unsigned long)name)) {
1391 system->ref_count |= SYSTEM_FL_FREE_NAME;
1392 system->name = kstrdup(name, GFP_KERNEL);
1393 if (!system->name)
1394 goto out_free;
1395 } else
1396 system->name = name;
1283 1397
1284 system->filter = NULL; 1398 system->filter = NULL;
1285 1399
@@ -1292,6 +1406,8 @@ create_new_subsystem(const char *name)
1292 return system; 1406 return system;
1293 1407
1294 out_free: 1408 out_free:
1409 if (system->ref_count & SYSTEM_FL_FREE_NAME)
1410 kfree(system->name);
1295 kfree(system); 1411 kfree(system);
1296 return NULL; 1412 return NULL;
1297} 1413}
@@ -1410,8 +1526,8 @@ event_create_dir(struct dentry *parent,
1410 1526
1411#ifdef CONFIG_PERF_EVENTS 1527#ifdef CONFIG_PERF_EVENTS
1412 if (call->event.type && call->class->reg) 1528 if (call->event.type && call->class->reg)
1413 trace_create_file("id", 0444, file->dir, call, 1529 trace_create_file("id", 0444, file->dir,
1414 id); 1530 (void *)(long)call->event.type, id);
1415#endif 1531#endif
1416 1532
1417 /* 1533 /*
@@ -1436,33 +1552,16 @@ event_create_dir(struct dentry *parent,
1436 return 0; 1552 return 0;
1437} 1553}
1438 1554
1439static void remove_subsystem(struct ftrace_subsystem_dir *dir)
1440{
1441 if (!dir)
1442 return;
1443
1444 if (!--dir->nr_events) {
1445 debugfs_remove_recursive(dir->entry);
1446 list_del(&dir->list);
1447 __put_system_dir(dir);
1448 }
1449}
1450
1451static void remove_event_from_tracers(struct ftrace_event_call *call) 1555static void remove_event_from_tracers(struct ftrace_event_call *call)
1452{ 1556{
1453 struct ftrace_event_file *file; 1557 struct ftrace_event_file *file;
1454 struct trace_array *tr; 1558 struct trace_array *tr;
1455 1559
1456 do_for_each_event_file_safe(tr, file) { 1560 do_for_each_event_file_safe(tr, file) {
1457
1458 if (file->event_call != call) 1561 if (file->event_call != call)
1459 continue; 1562 continue;
1460 1563
1461 list_del(&file->list); 1564 remove_event_file_dir(file);
1462 debugfs_remove_recursive(file->dir);
1463 remove_subsystem(file->system);
1464 kmem_cache_free(file_cachep, file);
1465
1466 /* 1565 /*
1467 * The do_for_each_event_file_safe() is 1566 * The do_for_each_event_file_safe() is
1468 * a double loop. After finding the call for this 1567 * a double loop. After finding the call for this
@@ -1591,6 +1690,7 @@ static void __add_event_to_tracers(struct ftrace_event_call *call,
1591int trace_add_event_call(struct ftrace_event_call *call) 1690int trace_add_event_call(struct ftrace_event_call *call)
1592{ 1691{
1593 int ret; 1692 int ret;
1693 mutex_lock(&trace_types_lock);
1594 mutex_lock(&event_mutex); 1694 mutex_lock(&event_mutex);
1595 1695
1596 ret = __register_event(call, NULL); 1696 ret = __register_event(call, NULL);
@@ -1598,11 +1698,13 @@ int trace_add_event_call(struct ftrace_event_call *call)
1598 __add_event_to_tracers(call, NULL); 1698 __add_event_to_tracers(call, NULL);
1599 1699
1600 mutex_unlock(&event_mutex); 1700 mutex_unlock(&event_mutex);
1701 mutex_unlock(&trace_types_lock);
1601 return ret; 1702 return ret;
1602} 1703}
1603 1704
1604/* 1705/*
1605 * Must be called under locking both of event_mutex and trace_event_sem. 1706 * Must be called under locking of trace_types_lock, event_mutex and
1707 * trace_event_sem.
1606 */ 1708 */
1607static void __trace_remove_event_call(struct ftrace_event_call *call) 1709static void __trace_remove_event_call(struct ftrace_event_call *call)
1608{ 1710{
@@ -1611,14 +1713,53 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
1611 destroy_preds(call); 1713 destroy_preds(call);
1612} 1714}
1613 1715
1716static int probe_remove_event_call(struct ftrace_event_call *call)
1717{
1718 struct trace_array *tr;
1719 struct ftrace_event_file *file;
1720
1721#ifdef CONFIG_PERF_EVENTS
1722 if (call->perf_refcount)
1723 return -EBUSY;
1724#endif
1725 do_for_each_event_file(tr, file) {
1726 if (file->event_call != call)
1727 continue;
1728 /*
1729 * We can't rely on ftrace_event_enable_disable(enable => 0)
1730 * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress
1731 * TRACE_REG_UNREGISTER.
1732 */
1733 if (file->flags & FTRACE_EVENT_FL_ENABLED)
1734 return -EBUSY;
1735 /*
1736 * The do_for_each_event_file_safe() is
1737 * a double loop. After finding the call for this
1738 * trace_array, we use break to jump to the next
1739 * trace_array.
1740 */
1741 break;
1742 } while_for_each_event_file();
1743
1744 __trace_remove_event_call(call);
1745
1746 return 0;
1747}
1748
1614/* Remove an event_call */ 1749/* Remove an event_call */
1615void trace_remove_event_call(struct ftrace_event_call *call) 1750int trace_remove_event_call(struct ftrace_event_call *call)
1616{ 1751{
1752 int ret;
1753
1754 mutex_lock(&trace_types_lock);
1617 mutex_lock(&event_mutex); 1755 mutex_lock(&event_mutex);
1618 down_write(&trace_event_sem); 1756 down_write(&trace_event_sem);
1619 __trace_remove_event_call(call); 1757 ret = probe_remove_event_call(call);
1620 up_write(&trace_event_sem); 1758 up_write(&trace_event_sem);
1621 mutex_unlock(&event_mutex); 1759 mutex_unlock(&event_mutex);
1760 mutex_unlock(&trace_types_lock);
1761
1762 return ret;
1622} 1763}
1623 1764
1624#define for_each_event(event, start, end) \ 1765#define for_each_event(event, start, end) \
@@ -1762,6 +1903,7 @@ static int trace_module_notify(struct notifier_block *self,
1762{ 1903{
1763 struct module *mod = data; 1904 struct module *mod = data;
1764 1905
1906 mutex_lock(&trace_types_lock);
1765 mutex_lock(&event_mutex); 1907 mutex_lock(&event_mutex);
1766 switch (val) { 1908 switch (val) {
1767 case MODULE_STATE_COMING: 1909 case MODULE_STATE_COMING:
@@ -1772,6 +1914,7 @@ static int trace_module_notify(struct notifier_block *self,
1772 break; 1914 break;
1773 } 1915 }
1774 mutex_unlock(&event_mutex); 1916 mutex_unlock(&event_mutex);
1917 mutex_unlock(&trace_types_lock);
1775 1918
1776 return 0; 1919 return 0;
1777} 1920}
@@ -2011,10 +2154,7 @@ event_enable_func(struct ftrace_hash *hash,
2011 int ret; 2154 int ret;
2012 2155
2013 /* hash funcs only work with set_ftrace_filter */ 2156 /* hash funcs only work with set_ftrace_filter */
2014 if (!enabled) 2157 if (!enabled || !param)
2015 return -EINVAL;
2016
2017 if (!param)
2018 return -EINVAL; 2158 return -EINVAL;
2019 2159
2020 system = strsep(&param, ":"); 2160 system = strsep(&param, ":");
@@ -2188,12 +2328,8 @@ __trace_remove_event_dirs(struct trace_array *tr)
2188{ 2328{
2189 struct ftrace_event_file *file, *next; 2329 struct ftrace_event_file *file, *next;
2190 2330
2191 list_for_each_entry_safe(file, next, &tr->events, list) { 2331 list_for_each_entry_safe(file, next, &tr->events, list)
2192 list_del(&file->list); 2332 remove_event_file_dir(file);
2193 debugfs_remove_recursive(file->dir);
2194 remove_subsystem(file->system);
2195 kmem_cache_free(file_cachep, file);
2196 }
2197} 2333}
2198 2334
2199static void 2335static void
@@ -2329,11 +2465,11 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
2329 2465
2330int event_trace_del_tracer(struct trace_array *tr) 2466int event_trace_del_tracer(struct trace_array *tr)
2331{ 2467{
2332 /* Disable any running events */
2333 __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
2334
2335 mutex_lock(&event_mutex); 2468 mutex_lock(&event_mutex);
2336 2469
2470 /* Disable any running events */
2471 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
2472
2337 down_write(&trace_event_sem); 2473 down_write(&trace_event_sem);
2338 __trace_remove_event_dirs(tr); 2474 __trace_remove_event_dirs(tr);
2339 debugfs_remove_recursive(tr->event_dir); 2475 debugfs_remove_recursive(tr->event_dir);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e1b653f7e1ca..97daa8cf958d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -44,6 +44,7 @@ enum filter_op_ids
44 OP_LE, 44 OP_LE,
45 OP_GT, 45 OP_GT,
46 OP_GE, 46 OP_GE,
47 OP_BAND,
47 OP_NONE, 48 OP_NONE,
48 OP_OPEN_PAREN, 49 OP_OPEN_PAREN,
49}; 50};
@@ -54,6 +55,7 @@ struct filter_op {
54 int precedence; 55 int precedence;
55}; 56};
56 57
58/* Order must be the same as enum filter_op_ids above */
57static struct filter_op filter_ops[] = { 59static struct filter_op filter_ops[] = {
58 { OP_OR, "||", 1 }, 60 { OP_OR, "||", 1 },
59 { OP_AND, "&&", 2 }, 61 { OP_AND, "&&", 2 },
@@ -64,6 +66,7 @@ static struct filter_op filter_ops[] = {
64 { OP_LE, "<=", 5 }, 66 { OP_LE, "<=", 5 },
65 { OP_GT, ">", 5 }, 67 { OP_GT, ">", 5 },
66 { OP_GE, ">=", 5 }, 68 { OP_GE, ">=", 5 },
69 { OP_BAND, "&", 6 },
67 { OP_NONE, "OP_NONE", 0 }, 70 { OP_NONE, "OP_NONE", 0 },
68 { OP_OPEN_PAREN, "(", 0 }, 71 { OP_OPEN_PAREN, "(", 0 },
69}; 72};
@@ -156,6 +159,9 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
156 case OP_GE: \ 159 case OP_GE: \
157 match = (*addr >= val); \ 160 match = (*addr >= val); \
158 break; \ 161 break; \
162 case OP_BAND: \
163 match = (*addr & val); \
164 break; \
159 default: \ 165 default: \
160 break; \ 166 break; \
161 } \ 167 } \
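The OP_BAND case added to the generated filter_pred_<type>() comparisons above is a plain bitwise AND that matches when any requested bit is set. A standalone sketch of the evaluation, with made-up field and value names:

#include <stdio.h>

enum { OP_GT, OP_GE, OP_BAND };

static int filter_pred_u32(int op, unsigned int field, unsigned int val)
{
	switch (op) {
	case OP_GE:   return field >= val;
	case OP_BAND: return (field & val) != 0;	/* new operator */
	default:      return 0;
	}
}

int main(void)
{
	printf("%d\n", filter_pred_u32(OP_BAND, 0x6, 0x2));	/* 1 */
	return 0;
}

With the "&" entry in filter_ops, a filter string such as "flags & 2" becomes expressible alongside the existing ==, !=, <, <=, > and >= comparisons.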
@@ -631,17 +637,15 @@ static void append_filter_err(struct filter_parse_state *ps,
631 free_page((unsigned long) buf); 637 free_page((unsigned long) buf);
632} 638}
633 639
640/* caller must hold event_mutex */
634void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 641void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
635{ 642{
636 struct event_filter *filter; 643 struct event_filter *filter = call->filter;
637 644
638 mutex_lock(&event_mutex);
639 filter = call->filter;
640 if (filter && filter->filter_string) 645 if (filter && filter->filter_string)
641 trace_seq_printf(s, "%s\n", filter->filter_string); 646 trace_seq_printf(s, "%s\n", filter->filter_string);
642 else 647 else
643 trace_seq_printf(s, "none\n"); 648 trace_seq_puts(s, "none\n");
644 mutex_unlock(&event_mutex);
645} 649}
646 650
647void print_subsystem_event_filter(struct event_subsystem *system, 651void print_subsystem_event_filter(struct event_subsystem *system,
@@ -654,7 +658,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
654 if (filter && filter->filter_string) 658 if (filter && filter->filter_string)
655 trace_seq_printf(s, "%s\n", filter->filter_string); 659 trace_seq_printf(s, "%s\n", filter->filter_string);
656 else 660 else
657 trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); 661 trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
658 mutex_unlock(&event_mutex); 662 mutex_unlock(&event_mutex);
659} 663}
660 664
@@ -1835,23 +1839,22 @@ static int create_system_filter(struct event_subsystem *system,
1835 return err; 1839 return err;
1836} 1840}
1837 1841
1842/* caller must hold event_mutex */
1838int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1843int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1839{ 1844{
1840 struct event_filter *filter; 1845 struct event_filter *filter;
1841 int err = 0; 1846 int err;
1842
1843 mutex_lock(&event_mutex);
1844 1847
1845 if (!strcmp(strstrip(filter_string), "0")) { 1848 if (!strcmp(strstrip(filter_string), "0")) {
1846 filter_disable(call); 1849 filter_disable(call);
1847 filter = call->filter; 1850 filter = call->filter;
1848 if (!filter) 1851 if (!filter)
1849 goto out_unlock; 1852 return 0;
1850 RCU_INIT_POINTER(call->filter, NULL); 1853 RCU_INIT_POINTER(call->filter, NULL);
1851 /* Make sure the filter is not being used */ 1854 /* Make sure the filter is not being used */
1852 synchronize_sched(); 1855 synchronize_sched();
1853 __free_filter(filter); 1856 __free_filter(filter);
1854 goto out_unlock; 1857 return 0;
1855 } 1858 }
1856 1859
1857 err = create_filter(call, filter_string, true, &filter); 1860 err = create_filter(call, filter_string, true, &filter);
@@ -1878,8 +1881,6 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1878 __free_filter(tmp); 1881 __free_filter(tmp);
1879 } 1882 }
1880 } 1883 }
1881out_unlock:
1882 mutex_unlock(&event_mutex);
1883 1884
1884 return err; 1885 return err;
1885} 1886}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c4d6d7191988..38fe1483c508 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -199,7 +199,7 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
199 return 0; 199 return 0;
200} 200}
201 201
202static struct tracer function_trace __read_mostly = 202static struct tracer function_trace __tracer_data =
203{ 203{
204 .name = "function", 204 .name = "function",
205 .init = function_trace_init, 205 .init = function_trace_init,
@@ -290,6 +290,21 @@ ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
290 trace_dump_stack(STACK_SKIP); 290 trace_dump_stack(STACK_SKIP);
291} 291}
292 292
293static void
294ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data)
295{
296 if (update_count(data))
297 ftrace_dump(DUMP_ALL);
298}
299
300/* Only dump the current CPU buffer. */
301static void
302ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data)
303{
304 if (update_count(data))
305 ftrace_dump(DUMP_ORIG);
306}
307
293static int 308static int
294ftrace_probe_print(const char *name, struct seq_file *m, 309ftrace_probe_print(const char *name, struct seq_file *m,
295 unsigned long ip, void *data) 310 unsigned long ip, void *data)
@@ -327,6 +342,20 @@ ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
327 return ftrace_probe_print("stacktrace", m, ip, data); 342 return ftrace_probe_print("stacktrace", m, ip, data);
328} 343}
329 344
345static int
346ftrace_dump_print(struct seq_file *m, unsigned long ip,
347 struct ftrace_probe_ops *ops, void *data)
348{
349 return ftrace_probe_print("dump", m, ip, data);
350}
351
352static int
353ftrace_cpudump_print(struct seq_file *m, unsigned long ip,
354 struct ftrace_probe_ops *ops, void *data)
355{
356 return ftrace_probe_print("cpudump", m, ip, data);
357}
358
330static struct ftrace_probe_ops traceon_count_probe_ops = { 359static struct ftrace_probe_ops traceon_count_probe_ops = {
331 .func = ftrace_traceon_count, 360 .func = ftrace_traceon_count,
332 .print = ftrace_traceon_print, 361 .print = ftrace_traceon_print,
@@ -342,6 +371,16 @@ static struct ftrace_probe_ops stacktrace_count_probe_ops = {
342 .print = ftrace_stacktrace_print, 371 .print = ftrace_stacktrace_print,
343}; 372};
344 373
374static struct ftrace_probe_ops dump_probe_ops = {
375 .func = ftrace_dump_probe,
376 .print = ftrace_dump_print,
377};
378
379static struct ftrace_probe_ops cpudump_probe_ops = {
380 .func = ftrace_cpudump_probe,
381 .print = ftrace_cpudump_print,
382};
383
345static struct ftrace_probe_ops traceon_probe_ops = { 384static struct ftrace_probe_ops traceon_probe_ops = {
346 .func = ftrace_traceon, 385 .func = ftrace_traceon,
347 .print = ftrace_traceon_print, 386 .print = ftrace_traceon_print,
@@ -425,6 +464,32 @@ ftrace_stacktrace_callback(struct ftrace_hash *hash,
425 param, enable); 464 param, enable);
426} 465}
427 466
467static int
468ftrace_dump_callback(struct ftrace_hash *hash,
469 char *glob, char *cmd, char *param, int enable)
470{
471 struct ftrace_probe_ops *ops;
472
473 ops = &dump_probe_ops;
474
475 /* Only dump once. */
476 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
477 "1", enable);
478}
479
480static int
481ftrace_cpudump_callback(struct ftrace_hash *hash,
482 char *glob, char *cmd, char *param, int enable)
483{
484 struct ftrace_probe_ops *ops;
485
486 ops = &cpudump_probe_ops;
487
488 /* Only dump once. */
489 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
490 "1", enable);
491}
492
428static struct ftrace_func_command ftrace_traceon_cmd = { 493static struct ftrace_func_command ftrace_traceon_cmd = {
429 .name = "traceon", 494 .name = "traceon",
430 .func = ftrace_trace_onoff_callback, 495 .func = ftrace_trace_onoff_callback,
@@ -440,6 +505,16 @@ static struct ftrace_func_command ftrace_stacktrace_cmd = {
440 .func = ftrace_stacktrace_callback, 505 .func = ftrace_stacktrace_callback,
441}; 506};
442 507
508static struct ftrace_func_command ftrace_dump_cmd = {
509 .name = "dump",
510 .func = ftrace_dump_callback,
511};
512
513static struct ftrace_func_command ftrace_cpudump_cmd = {
514 .name = "cpudump",
515 .func = ftrace_cpudump_callback,
516};
517
443static int __init init_func_cmd_traceon(void) 518static int __init init_func_cmd_traceon(void)
444{ 519{
445 int ret; 520 int ret;
@@ -450,13 +525,31 @@ static int __init init_func_cmd_traceon(void)
450 525
451 ret = register_ftrace_command(&ftrace_traceon_cmd); 526 ret = register_ftrace_command(&ftrace_traceon_cmd);
452 if (ret) 527 if (ret)
453 unregister_ftrace_command(&ftrace_traceoff_cmd); 528 goto out_free_traceoff;
454 529
455 ret = register_ftrace_command(&ftrace_stacktrace_cmd); 530 ret = register_ftrace_command(&ftrace_stacktrace_cmd);
456 if (ret) { 531 if (ret)
457 unregister_ftrace_command(&ftrace_traceoff_cmd); 532 goto out_free_traceon;
458 unregister_ftrace_command(&ftrace_traceon_cmd); 533
459 } 534 ret = register_ftrace_command(&ftrace_dump_cmd);
535 if (ret)
536 goto out_free_stacktrace;
537
538 ret = register_ftrace_command(&ftrace_cpudump_cmd);
539 if (ret)
540 goto out_free_dump;
541
542 return 0;
543
544 out_free_dump:
545 unregister_ftrace_command(&ftrace_dump_cmd);
546 out_free_stacktrace:
547 unregister_ftrace_command(&ftrace_stacktrace_cmd);
548 out_free_traceon:
549 unregister_ftrace_command(&ftrace_traceon_cmd);
550 out_free_traceoff:
551 unregister_ftrace_command(&ftrace_traceoff_cmd);
552
460 return ret; 553 return ret;
461} 554}
462#else 555#else
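init_func_cmd_traceon() above moves from ad-hoc cleanup to a goto-unwind ladder: each successful registration gets a matching label, so a later failure unregisters everything already done, in reverse order. A standalone sketch of that structure; register_cmd()/unregister_cmd() are stand-ins, not kernel API.

#include <stdio.h>

static int register_cmd(const char *name, int fail)
{
	if (fail)
		return -1;
	printf("registered %s\n", name);
	return 0;
}

static void unregister_cmd(const char *name)
{
	printf("unregistered %s\n", name);
}

static int init_cmds(int fail_at_dump)
{
	int ret;

	if ((ret = register_cmd("traceoff", 0)))
		return ret;
	if ((ret = register_cmd("traceon", 0)))
		goto out_free_traceoff;
	if ((ret = register_cmd("stacktrace", 0)))
		goto out_free_traceon;
	if ((ret = register_cmd("dump", fail_at_dump)))
		goto out_free_stacktrace;
	if ((ret = register_cmd("cpudump", 0)))
		goto out_free_dump;
	return 0;

 out_free_dump:
	unregister_cmd("dump");
 out_free_stacktrace:
	unregister_cmd("stacktrace");
 out_free_traceon:
	unregister_cmd("traceon");
 out_free_traceoff:
	unregister_cmd("traceoff");
	return ret;
}

int main(void)
{
	return init_cmds(1) ? 1 : 0;	/* exercise the unwind path */
}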
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8388bc99f2ee..b5c09242683d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -446,7 +446,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
446 446
447 /* First spaces to align center */ 447 /* First spaces to align center */
448 for (i = 0; i < spaces / 2; i++) { 448 for (i = 0; i < spaces / 2; i++) {
449 ret = trace_seq_printf(s, " "); 449 ret = trace_seq_putc(s, ' ');
450 if (!ret) 450 if (!ret)
451 return TRACE_TYPE_PARTIAL_LINE; 451 return TRACE_TYPE_PARTIAL_LINE;
452 } 452 }
@@ -457,7 +457,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
457 457
458 /* Last spaces to align center */ 458 /* Last spaces to align center */
459 for (i = 0; i < spaces - (spaces / 2); i++) { 459 for (i = 0; i < spaces - (spaces / 2); i++) {
460 ret = trace_seq_printf(s, " "); 460 ret = trace_seq_putc(s, ' ');
461 if (!ret) 461 if (!ret)
462 return TRACE_TYPE_PARTIAL_LINE; 462 return TRACE_TYPE_PARTIAL_LINE;
463 } 463 }
@@ -503,7 +503,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
503 ------------------------------------------ 503 ------------------------------------------
504 504
505 */ 505 */
506 ret = trace_seq_printf(s, 506 ret = trace_seq_puts(s,
507 " ------------------------------------------\n"); 507 " ------------------------------------------\n");
508 if (!ret) 508 if (!ret)
509 return TRACE_TYPE_PARTIAL_LINE; 509 return TRACE_TYPE_PARTIAL_LINE;
@@ -516,7 +516,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
516 if (ret == TRACE_TYPE_PARTIAL_LINE) 516 if (ret == TRACE_TYPE_PARTIAL_LINE)
517 return TRACE_TYPE_PARTIAL_LINE; 517 return TRACE_TYPE_PARTIAL_LINE;
518 518
519 ret = trace_seq_printf(s, " => "); 519 ret = trace_seq_puts(s, " => ");
520 if (!ret) 520 if (!ret)
521 return TRACE_TYPE_PARTIAL_LINE; 521 return TRACE_TYPE_PARTIAL_LINE;
522 522
@@ -524,7 +524,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
524 if (ret == TRACE_TYPE_PARTIAL_LINE) 524 if (ret == TRACE_TYPE_PARTIAL_LINE)
525 return TRACE_TYPE_PARTIAL_LINE; 525 return TRACE_TYPE_PARTIAL_LINE;
526 526
527 ret = trace_seq_printf(s, 527 ret = trace_seq_puts(s,
528 "\n ------------------------------------------\n\n"); 528 "\n ------------------------------------------\n\n");
529 if (!ret) 529 if (!ret)
530 return TRACE_TYPE_PARTIAL_LINE; 530 return TRACE_TYPE_PARTIAL_LINE;
@@ -645,7 +645,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
645 ret = print_graph_proc(s, pid); 645 ret = print_graph_proc(s, pid);
646 if (ret == TRACE_TYPE_PARTIAL_LINE) 646 if (ret == TRACE_TYPE_PARTIAL_LINE)
647 return TRACE_TYPE_PARTIAL_LINE; 647 return TRACE_TYPE_PARTIAL_LINE;
648 ret = trace_seq_printf(s, " | "); 648 ret = trace_seq_puts(s, " | ");
649 if (!ret) 649 if (!ret)
650 return TRACE_TYPE_PARTIAL_LINE; 650 return TRACE_TYPE_PARTIAL_LINE;
651 } 651 }
@@ -657,9 +657,9 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
657 return ret; 657 return ret;
658 658
659 if (type == TRACE_GRAPH_ENT) 659 if (type == TRACE_GRAPH_ENT)
660 ret = trace_seq_printf(s, "==========>"); 660 ret = trace_seq_puts(s, "==========>");
661 else 661 else
662 ret = trace_seq_printf(s, "<=========="); 662 ret = trace_seq_puts(s, "<==========");
663 663
664 if (!ret) 664 if (!ret)
665 return TRACE_TYPE_PARTIAL_LINE; 665 return TRACE_TYPE_PARTIAL_LINE;
@@ -668,7 +668,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
668 if (ret != TRACE_TYPE_HANDLED) 668 if (ret != TRACE_TYPE_HANDLED)
669 return ret; 669 return ret;
670 670
671 ret = trace_seq_printf(s, "\n"); 671 ret = trace_seq_putc(s, '\n');
672 672
673 if (!ret) 673 if (!ret)
674 return TRACE_TYPE_PARTIAL_LINE; 674 return TRACE_TYPE_PARTIAL_LINE;
@@ -705,13 +705,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
705 len += strlen(nsecs_str); 705 len += strlen(nsecs_str);
706 } 706 }
707 707
708 ret = trace_seq_printf(s, " us "); 708 ret = trace_seq_puts(s, " us ");
709 if (!ret) 709 if (!ret)
710 return TRACE_TYPE_PARTIAL_LINE; 710 return TRACE_TYPE_PARTIAL_LINE;
711 711
712 /* Print remaining spaces to fit the row's width */ 712 /* Print remaining spaces to fit the row's width */
713 for (i = len; i < 7; i++) { 713 for (i = len; i < 7; i++) {
714 ret = trace_seq_printf(s, " "); 714 ret = trace_seq_putc(s, ' ');
715 if (!ret) 715 if (!ret)
716 return TRACE_TYPE_PARTIAL_LINE; 716 return TRACE_TYPE_PARTIAL_LINE;
717 } 717 }
@@ -731,13 +731,13 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
731 /* No real adata, just filling the column with spaces */ 731 /* No real adata, just filling the column with spaces */
732 switch (duration) { 732 switch (duration) {
733 case DURATION_FILL_FULL: 733 case DURATION_FILL_FULL:
734 ret = trace_seq_printf(s, " | "); 734 ret = trace_seq_puts(s, " | ");
735 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 735 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
736 case DURATION_FILL_START: 736 case DURATION_FILL_START:
737 ret = trace_seq_printf(s, " "); 737 ret = trace_seq_puts(s, " ");
738 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 738 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
739 case DURATION_FILL_END: 739 case DURATION_FILL_END:
740 ret = trace_seq_printf(s, " |"); 740 ret = trace_seq_puts(s, " |");
741 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 741 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
742 } 742 }
743 743
@@ -745,10 +745,10 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
745 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { 745 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
746 /* Duration exceeded 100 msecs */ 746 /* Duration exceeded 100 msecs */
747 if (duration > 100000ULL) 747 if (duration > 100000ULL)
748 ret = trace_seq_printf(s, "! "); 748 ret = trace_seq_puts(s, "! ");
749 /* Duration exceeded 10 msecs */ 749 /* Duration exceeded 10 msecs */
750 else if (duration > 10000ULL) 750 else if (duration > 10000ULL)
751 ret = trace_seq_printf(s, "+ "); 751 ret = trace_seq_puts(s, "+ ");
752 } 752 }
753 753
754 /* 754 /*
@@ -757,7 +757,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
757 * to fill out the space. 757 * to fill out the space.
758 */ 758 */
759 if (ret == -1) 759 if (ret == -1)
760 ret = trace_seq_printf(s, " "); 760 ret = trace_seq_puts(s, " ");
761 761
762 /* Catching here any failure happenned above */ 762 /* Catching here any failure happenned above */
763 if (!ret) 763 if (!ret)
@@ -767,7 +767,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
767 if (ret != TRACE_TYPE_HANDLED) 767 if (ret != TRACE_TYPE_HANDLED)
768 return ret; 768 return ret;
769 769
770 ret = trace_seq_printf(s, "| "); 770 ret = trace_seq_puts(s, "| ");
771 if (!ret) 771 if (!ret)
772 return TRACE_TYPE_PARTIAL_LINE; 772 return TRACE_TYPE_PARTIAL_LINE;
773 773
@@ -817,7 +817,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
817 817
818 /* Function */ 818 /* Function */
819 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 819 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
820 ret = trace_seq_printf(s, " "); 820 ret = trace_seq_putc(s, ' ');
821 if (!ret) 821 if (!ret)
822 return TRACE_TYPE_PARTIAL_LINE; 822 return TRACE_TYPE_PARTIAL_LINE;
823 } 823 }
@@ -858,7 +858,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
858 858
859 /* Function */ 859 /* Function */
860 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 860 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
861 ret = trace_seq_printf(s, " "); 861 ret = trace_seq_putc(s, ' ');
862 if (!ret) 862 if (!ret)
863 return TRACE_TYPE_PARTIAL_LINE; 863 return TRACE_TYPE_PARTIAL_LINE;
864 } 864 }
@@ -917,7 +917,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
917 if (ret == TRACE_TYPE_PARTIAL_LINE) 917 if (ret == TRACE_TYPE_PARTIAL_LINE)
918 return TRACE_TYPE_PARTIAL_LINE; 918 return TRACE_TYPE_PARTIAL_LINE;
919 919
920 ret = trace_seq_printf(s, " | "); 920 ret = trace_seq_puts(s, " | ");
921 if (!ret) 921 if (!ret)
922 return TRACE_TYPE_PARTIAL_LINE; 922 return TRACE_TYPE_PARTIAL_LINE;
923 } 923 }
@@ -1117,7 +1117,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1117 1117
1118 /* Closing brace */ 1118 /* Closing brace */
1119 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1119 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
1120 ret = trace_seq_printf(s, " "); 1120 ret = trace_seq_putc(s, ' ');
1121 if (!ret) 1121 if (!ret)
1122 return TRACE_TYPE_PARTIAL_LINE; 1122 return TRACE_TYPE_PARTIAL_LINE;
1123 } 1123 }
@@ -1129,7 +1129,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1129 * belongs to, write out the function name. 1129 * belongs to, write out the function name.
1130 */ 1130 */
1131 if (func_match) { 1131 if (func_match) {
1132 ret = trace_seq_printf(s, "}\n"); 1132 ret = trace_seq_puts(s, "}\n");
1133 if (!ret) 1133 if (!ret)
1134 return TRACE_TYPE_PARTIAL_LINE; 1134 return TRACE_TYPE_PARTIAL_LINE;
1135 } else { 1135 } else {
@@ -1179,13 +1179,13 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1179 /* Indentation */ 1179 /* Indentation */
1180 if (depth > 0) 1180 if (depth > 0)
1181 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { 1181 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
1182 ret = trace_seq_printf(s, " "); 1182 ret = trace_seq_putc(s, ' ');
1183 if (!ret) 1183 if (!ret)
1184 return TRACE_TYPE_PARTIAL_LINE; 1184 return TRACE_TYPE_PARTIAL_LINE;
1185 } 1185 }
1186 1186
1187 /* The comment */ 1187 /* The comment */
1188 ret = trace_seq_printf(s, "/* "); 1188 ret = trace_seq_puts(s, "/* ");
1189 if (!ret) 1189 if (!ret)
1190 return TRACE_TYPE_PARTIAL_LINE; 1190 return TRACE_TYPE_PARTIAL_LINE;
1191 1191
@@ -1216,7 +1216,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1216 s->len--; 1216 s->len--;
1217 } 1217 }
1218 1218
1219 ret = trace_seq_printf(s, " */\n"); 1219 ret = trace_seq_puts(s, " */\n");
1220 if (!ret) 1220 if (!ret)
1221 return TRACE_TYPE_PARTIAL_LINE; 1221 return TRACE_TYPE_PARTIAL_LINE;
1222 1222
@@ -1448,7 +1448,7 @@ static struct trace_event graph_trace_ret_event = {
1448 .funcs = &graph_functions 1448 .funcs = &graph_functions
1449}; 1449};
1450 1450
1451static struct tracer graph_trace __read_mostly = { 1451static struct tracer graph_trace __tracer_data = {
1452 .name = "function_graph", 1452 .name = "function_graph",
1453 .open = graph_trace_open, 1453 .open = graph_trace_open,
1454 .pipe_open = graph_trace_open, 1454 .pipe_open = graph_trace_open,
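The trace_functions_graph.c hunks above swap trace_seq_printf() for trace_seq_puts()/trace_seq_putc() wherever the output is a constant string or a single character, avoiding the format-string parser on a hot print path. A minimal sketch of the same pattern in an output callback, assuming the 3.x trace_seq API in which these helpers return non-zero on success and 0 when the seq buffer is full; my_print_header() is illustrative and not part of the patch:

#include <linux/ftrace_event.h>
#include <linux/trace_seq.h>

static enum print_line_t my_print_header(struct trace_seq *s)
{
        /* Constant string: no need for the printf() machinery. */
        if (!trace_seq_puts(s, " | "))
                return TRACE_TYPE_PARTIAL_LINE;

        /* A single character is cheaper still via putc(). */
        if (!trace_seq_putc(s, '\n'))
                return TRACE_TYPE_PARTIAL_LINE;

        return TRACE_TYPE_HANDLED;
}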
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b19d065a28cb..2aefbee93a6d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -373,7 +373,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
373 struct trace_array_cpu *data; 373 struct trace_array_cpu *data;
374 unsigned long flags; 374 unsigned long flags;
375 375
376 if (likely(!tracer_enabled)) 376 if (!tracer_enabled || !tracing_is_enabled())
377 return; 377 return;
378 378
379 cpu = raw_smp_processor_id(); 379 cpu = raw_smp_processor_id();
@@ -416,7 +416,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
416 else 416 else
417 return; 417 return;
418 418
419 if (!tracer_enabled) 419 if (!tracer_enabled || !tracing_is_enabled())
420 return; 420 return;
421 421
422 data = per_cpu_ptr(tr->trace_buffer.data, cpu); 422 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9f46e98ba8f2..243f6834d026 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -35,12 +35,17 @@ struct trace_probe {
35 const char *symbol; /* symbol name */ 35 const char *symbol; /* symbol name */
36 struct ftrace_event_class class; 36 struct ftrace_event_class class;
37 struct ftrace_event_call call; 37 struct ftrace_event_call call;
38 struct ftrace_event_file * __rcu *files; 38 struct list_head files;
39 ssize_t size; /* trace entry size */ 39 ssize_t size; /* trace entry size */
40 unsigned int nr_args; 40 unsigned int nr_args;
41 struct probe_arg args[]; 41 struct probe_arg args[];
42}; 42};
43 43
44struct event_file_link {
45 struct ftrace_event_file *file;
46 struct list_head list;
47};
48
44#define SIZEOF_TRACE_PROBE(n) \ 49#define SIZEOF_TRACE_PROBE(n) \
45 (offsetof(struct trace_probe, args) + \ 50 (offsetof(struct trace_probe, args) + \
46 (sizeof(struct probe_arg) * (n))) 51 (sizeof(struct probe_arg) * (n)))
@@ -90,7 +95,7 @@ static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp)
90} 95}
91 96
92static int register_probe_event(struct trace_probe *tp); 97static int register_probe_event(struct trace_probe *tp);
93static void unregister_probe_event(struct trace_probe *tp); 98static int unregister_probe_event(struct trace_probe *tp);
94 99
95static DEFINE_MUTEX(probe_lock); 100static DEFINE_MUTEX(probe_lock);
96static LIST_HEAD(probe_list); 101static LIST_HEAD(probe_list);
@@ -150,6 +155,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
150 goto error; 155 goto error;
151 156
152 INIT_LIST_HEAD(&tp->list); 157 INIT_LIST_HEAD(&tp->list);
158 INIT_LIST_HEAD(&tp->files);
153 return tp; 159 return tp;
154error: 160error:
155 kfree(tp->call.name); 161 kfree(tp->call.name);
@@ -183,25 +189,6 @@ static struct trace_probe *find_trace_probe(const char *event,
183 return NULL; 189 return NULL;
184} 190}
185 191
186static int trace_probe_nr_files(struct trace_probe *tp)
187{
188 struct ftrace_event_file **file;
189 int ret = 0;
190
191 /*
192 * Since all tp->files updater is protected by probe_enable_lock,
193 * we don't need to lock an rcu_read_lock.
194 */
195 file = rcu_dereference_raw(tp->files);
196 if (file)
197 while (*(file++))
198 ret++;
199
200 return ret;
201}
202
203static DEFINE_MUTEX(probe_enable_lock);
204
205/* 192/*
206 * Enable trace_probe 193 * Enable trace_probe
207 * if the file is NULL, enable "perf" handler, or enable "trace" handler. 194 * if the file is NULL, enable "perf" handler, or enable "trace" handler.
@@ -211,67 +198,42 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
211{ 198{
212 int ret = 0; 199 int ret = 0;
213 200
214 mutex_lock(&probe_enable_lock);
215
216 if (file) { 201 if (file) {
217 struct ftrace_event_file **new, **old; 202 struct event_file_link *link;
218 int n = trace_probe_nr_files(tp); 203
219 204 link = kmalloc(sizeof(*link), GFP_KERNEL);
220 old = rcu_dereference_raw(tp->files); 205 if (!link) {
221 /* 1 is for new one and 1 is for stopper */
222 new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *),
223 GFP_KERNEL);
224 if (!new) {
225 ret = -ENOMEM; 206 ret = -ENOMEM;
226 goto out_unlock; 207 goto out;
227 } 208 }
228 memcpy(new, old, n * sizeof(struct ftrace_event_file *));
229 new[n] = file;
230 /* The last one keeps a NULL */
231 209
232 rcu_assign_pointer(tp->files, new); 210 link->file = file;
233 tp->flags |= TP_FLAG_TRACE; 211 list_add_tail_rcu(&link->list, &tp->files);
234 212
235 if (old) { 213 tp->flags |= TP_FLAG_TRACE;
236 /* Make sure the probe is done with old files */
237 synchronize_sched();
238 kfree(old);
239 }
240 } else 214 } else
241 tp->flags |= TP_FLAG_PROFILE; 215 tp->flags |= TP_FLAG_PROFILE;
242 216
243 if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && 217 if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) {
244 !trace_probe_has_gone(tp)) {
245 if (trace_probe_is_return(tp)) 218 if (trace_probe_is_return(tp))
246 ret = enable_kretprobe(&tp->rp); 219 ret = enable_kretprobe(&tp->rp);
247 else 220 else
248 ret = enable_kprobe(&tp->rp.kp); 221 ret = enable_kprobe(&tp->rp.kp);
249 } 222 }
250 223 out:
251 out_unlock:
252 mutex_unlock(&probe_enable_lock);
253
254 return ret; 224 return ret;
255} 225}
256 226
257static int 227static struct event_file_link *
258trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) 228find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
259{ 229{
260 struct ftrace_event_file **files; 230 struct event_file_link *link;
261 int i;
262 231
263 /* 232 list_for_each_entry(link, &tp->files, list)
264 * Since all tp->files updater is protected by probe_enable_lock, 233 if (link->file == file)
265 * we don't need to lock an rcu_read_lock. 234 return link;
266 */
267 files = rcu_dereference_raw(tp->files);
268 if (files) {
269 for (i = 0; files[i]; i++)
270 if (files[i] == file)
271 return i;
272 }
273 235
274 return -1; 236 return NULL;
275} 237}
276 238
277/* 239/*
@@ -281,43 +243,23 @@ trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file)
281static int 243static int
282disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) 244disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
283{ 245{
246 struct event_file_link *link = NULL;
247 int wait = 0;
284 int ret = 0; 248 int ret = 0;
285 249
286 mutex_lock(&probe_enable_lock);
287
288 if (file) { 250 if (file) {
289 struct ftrace_event_file **new, **old; 251 link = find_event_file_link(tp, file);
290 int n = trace_probe_nr_files(tp); 252 if (!link) {
291 int i, j;
292
293 old = rcu_dereference_raw(tp->files);
294 if (n == 0 || trace_probe_file_index(tp, file) < 0) {
295 ret = -EINVAL; 253 ret = -EINVAL;
296 goto out_unlock; 254 goto out;
297 } 255 }
298 256
299 if (n == 1) { /* Remove the last file */ 257 list_del_rcu(&link->list);
300 tp->flags &= ~TP_FLAG_TRACE; 258 wait = 1;
301 new = NULL; 259 if (!list_empty(&tp->files))
302 } else { 260 goto out;
303 new = kzalloc(n * sizeof(struct ftrace_event_file *),
304 GFP_KERNEL);
305 if (!new) {
306 ret = -ENOMEM;
307 goto out_unlock;
308 }
309
310 /* This copy & check loop copies the NULL stopper too */
311 for (i = 0, j = 0; j < n && i < n + 1; i++)
312 if (old[i] != file)
313 new[j++] = old[i];
314 }
315
316 rcu_assign_pointer(tp->files, new);
317 261
318 /* Make sure the probe is done with old files */ 262 tp->flags &= ~TP_FLAG_TRACE;
319 synchronize_sched();
320 kfree(old);
321 } else 263 } else
322 tp->flags &= ~TP_FLAG_PROFILE; 264 tp->flags &= ~TP_FLAG_PROFILE;
323 265
@@ -326,10 +268,21 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
326 disable_kretprobe(&tp->rp); 268 disable_kretprobe(&tp->rp);
327 else 269 else
328 disable_kprobe(&tp->rp.kp); 270 disable_kprobe(&tp->rp.kp);
271 wait = 1;
272 }
273 out:
274 if (wait) {
275 /*
276 * Synchronize with kprobe_trace_func/kretprobe_trace_func
277 * to ensure disabled (all running handlers are finished).
278 * This is not only for kfree(), but also the caller,
279 * trace_remove_event_call() supposes it for releasing
280 * event_call related objects, which will be accessed in
281 * the kprobe_trace_func/kretprobe_trace_func.
282 */
283 synchronize_sched();
284 kfree(link); /* Ignored if link == NULL */
329 } 285 }
330
331 out_unlock:
332 mutex_unlock(&probe_enable_lock);
333 286
334 return ret; 287 return ret;
335} 288}
@@ -398,9 +351,12 @@ static int unregister_trace_probe(struct trace_probe *tp)
398 if (trace_probe_is_enabled(tp)) 351 if (trace_probe_is_enabled(tp))
399 return -EBUSY; 352 return -EBUSY;
400 353
354 /* Will fail if probe is being used by ftrace or perf */
355 if (unregister_probe_event(tp))
356 return -EBUSY;
357
401 __unregister_trace_probe(tp); 358 __unregister_trace_probe(tp);
402 list_del(&tp->list); 359 list_del(&tp->list);
403 unregister_probe_event(tp);
404 360
405 return 0; 361 return 0;
406} 362}
@@ -679,7 +635,9 @@ static int release_all_trace_probes(void)
679 /* TODO: Use batch unregistration */ 635 /* TODO: Use batch unregistration */
680 while (!list_empty(&probe_list)) { 636 while (!list_empty(&probe_list)) {
681 tp = list_entry(probe_list.next, struct trace_probe, list); 637 tp = list_entry(probe_list.next, struct trace_probe, list);
682 unregister_trace_probe(tp); 638 ret = unregister_trace_probe(tp);
639 if (ret)
640 goto end;
683 free_trace_probe(tp); 641 free_trace_probe(tp);
684 } 642 }
685 643
@@ -885,20 +843,10 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
885static __kprobes void 843static __kprobes void
886kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) 844kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
887{ 845{
888 /* 846 struct event_file_link *link;
889 * Note: preempt is already disabled around the kprobe handler.
890 * However, we still need an smp_read_barrier_depends() corresponding
891 * to smp_wmb() in rcu_assign_pointer() to access the pointer.
892 */
893 struct ftrace_event_file **file = rcu_dereference_raw(tp->files);
894
895 if (unlikely(!file))
896 return;
897 847
898 while (*file) { 848 list_for_each_entry_rcu(link, &tp->files, list)
899 __kprobe_trace_func(tp, regs, *file); 849 __kprobe_trace_func(tp, regs, link->file);
900 file++;
901 }
902} 850}
903 851
904/* Kretprobe handler */ 852/* Kretprobe handler */
@@ -945,20 +893,10 @@ static __kprobes void
945kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, 893kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
946 struct pt_regs *regs) 894 struct pt_regs *regs)
947{ 895{
948 /* 896 struct event_file_link *link;
949 * Note: preempt is already disabled around the kprobe handler.
950 * However, we still need an smp_read_barrier_depends() corresponding
951 * to smp_wmb() in rcu_assign_pointer() to access the pointer.
952 */
953 struct ftrace_event_file **file = rcu_dereference_raw(tp->files);
954
955 if (unlikely(!file))
956 return;
957 897
958 while (*file) { 898 list_for_each_entry_rcu(link, &tp->files, list)
959 __kretprobe_trace_func(tp, ri, regs, *file); 899 __kretprobe_trace_func(tp, ri, regs, link->file);
960 file++;
961 }
962} 900}
963 901
964/* Event entry printers */ 902/* Event entry printers */
@@ -1157,13 +1095,14 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
1157 int size, __size, dsize; 1095 int size, __size, dsize;
1158 int rctx; 1096 int rctx;
1159 1097
1098 head = this_cpu_ptr(call->perf_events);
1099 if (hlist_empty(head))
1100 return;
1101
1160 dsize = __get_data_size(tp, regs); 1102 dsize = __get_data_size(tp, regs);
1161 __size = sizeof(*entry) + tp->size + dsize; 1103 __size = sizeof(*entry) + tp->size + dsize;
1162 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1104 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1163 size -= sizeof(u32); 1105 size -= sizeof(u32);
1164 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1165 "profile buffer not large enough"))
1166 return;
1167 1106
1168 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1107 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1169 if (!entry) 1108 if (!entry)
@@ -1172,10 +1111,7 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
1172 entry->ip = (unsigned long)tp->rp.kp.addr; 1111 entry->ip = (unsigned long)tp->rp.kp.addr;
1173 memset(&entry[1], 0, dsize); 1112 memset(&entry[1], 0, dsize);
1174 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1113 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1175 1114 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1176 head = this_cpu_ptr(call->perf_events);
1177 perf_trace_buf_submit(entry, size, rctx,
1178 entry->ip, 1, regs, head, NULL);
1179} 1115}
1180 1116
1181/* Kretprobe profile handler */ 1117/* Kretprobe profile handler */
@@ -1189,13 +1125,14 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1189 int size, __size, dsize; 1125 int size, __size, dsize;
1190 int rctx; 1126 int rctx;
1191 1127
1128 head = this_cpu_ptr(call->perf_events);
1129 if (hlist_empty(head))
1130 return;
1131
1192 dsize = __get_data_size(tp, regs); 1132 dsize = __get_data_size(tp, regs);
1193 __size = sizeof(*entry) + tp->size + dsize; 1133 __size = sizeof(*entry) + tp->size + dsize;
1194 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1134 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1195 size -= sizeof(u32); 1135 size -= sizeof(u32);
1196 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1197 "profile buffer not large enough"))
1198 return;
1199 1136
1200 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1137 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1201 if (!entry) 1138 if (!entry)
@@ -1204,13 +1141,16 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1204 entry->func = (unsigned long)tp->rp.kp.addr; 1141 entry->func = (unsigned long)tp->rp.kp.addr;
1205 entry->ret_ip = (unsigned long)ri->ret_addr; 1142 entry->ret_ip = (unsigned long)ri->ret_addr;
1206 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1143 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1207 1144 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1208 head = this_cpu_ptr(call->perf_events);
1209 perf_trace_buf_submit(entry, size, rctx,
1210 entry->ret_ip, 1, regs, head, NULL);
1211} 1145}
1212#endif /* CONFIG_PERF_EVENTS */ 1146#endif /* CONFIG_PERF_EVENTS */
1213 1147
1148/*
1149 * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex.
1150 *
1151 * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
1152 * lockless, but we can't race with this __init function.
1153 */
1214static __kprobes 1154static __kprobes
1215int kprobe_register(struct ftrace_event_call *event, 1155int kprobe_register(struct ftrace_event_call *event,
1216 enum trace_reg type, void *data) 1156 enum trace_reg type, void *data)
@@ -1312,11 +1252,15 @@ static int register_probe_event(struct trace_probe *tp)
1312 return ret; 1252 return ret;
1313} 1253}
1314 1254
1315static void unregister_probe_event(struct trace_probe *tp) 1255static int unregister_probe_event(struct trace_probe *tp)
1316{ 1256{
1257 int ret;
1258
1317 /* tp->event is unregistered in trace_remove_event_call() */ 1259 /* tp->event is unregistered in trace_remove_event_call() */
1318 trace_remove_event_call(&tp->call); 1260 ret = trace_remove_event_call(&tp->call);
1319 kfree(tp->call.print_fmt); 1261 if (!ret)
1262 kfree(tp->call.print_fmt);
1263 return ret;
1320} 1264}
1321 1265
1322/* Make a debugfs interface for controlling probe points */ 1266/* Make a debugfs interface for controlling probe points */
@@ -1376,6 +1320,10 @@ find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr)
1376 return NULL; 1320 return NULL;
1377} 1321}
1378 1322
1323/*
1324 * Nobody but us can call enable_trace_probe/disable_trace_probe at this
1325 * stage, we can do this lockless.
1326 */
1379static __init int kprobe_trace_self_tests_init(void) 1327static __init int kprobe_trace_self_tests_init(void)
1380{ 1328{
1381 int ret, warn = 0; 1329 int ret, warn = 0;
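The trace_kprobe.c rework above drops the hand-rolled, reallocated array of ftrace_event_file pointers (and its private probe_enable_lock) in favour of a plain RCU-protected list of event_file_link nodes. A sketch of the underlying pattern, assuming updates are serialized by the callers (event_mutex in the patch) and readers run with preemption disabled as kprobe handlers do; all my_* names are illustrative:

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_file_link {
        void                    *file;
        struct list_head        list;
};

static LIST_HEAD(my_files);             /* updates serialized by the caller */

static int my_attach_file(void *file)
{
        struct my_file_link *link = kmalloc(sizeof(*link), GFP_KERNEL);

        if (!link)
                return -ENOMEM;
        link->file = file;
        list_add_tail_rcu(&link->list, &my_files);
        return 0;
}

static void my_detach_file(struct my_file_link *link)
{
        list_del_rcu(&link->list);
        synchronize_sched();    /* wait for preempt-disabled handlers to finish */
        kfree(link);
}

/* Called from a kprobe handler, i.e. with preemption already disabled. */
static void my_handler(void)
{
        struct my_file_link *link;

        list_for_each_entry_rcu(link, &my_files, list)
                pr_debug("dispatching to file %p\n", link->file);
}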
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index a5e8f4878bfa..b3dcfb2f0fef 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -90,7 +90,7 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
90 if (drv) 90 if (drv)
91 ret += trace_seq_printf(s, " %s\n", drv->name); 91 ret += trace_seq_printf(s, " %s\n", drv->name);
92 else 92 else
93 ret += trace_seq_printf(s, " \n"); 93 ret += trace_seq_puts(s, " \n");
94 return ret; 94 return ret;
95} 95}
96 96
@@ -107,7 +107,7 @@ static void mmio_pipe_open(struct trace_iterator *iter)
107 struct header_iter *hiter; 107 struct header_iter *hiter;
108 struct trace_seq *s = &iter->seq; 108 struct trace_seq *s = &iter->seq;
109 109
110 trace_seq_printf(s, "VERSION 20070824\n"); 110 trace_seq_puts(s, "VERSION 20070824\n");
111 111
112 hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); 112 hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
113 if (!hiter) 113 if (!hiter)
@@ -209,7 +209,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
209 (rw->value >> 0) & 0xff, rw->pc, 0); 209 (rw->value >> 0) & 0xff, rw->pc, 0);
210 break; 210 break;
211 default: 211 default:
212 ret = trace_seq_printf(s, "rw what?\n"); 212 ret = trace_seq_puts(s, "rw what?\n");
213 break; 213 break;
214 } 214 }
215 if (ret) 215 if (ret)
@@ -245,7 +245,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
245 secs, usec_rem, m->map_id, 0UL, 0); 245 secs, usec_rem, m->map_id, 0UL, 0);
246 break; 246 break;
247 default: 247 default:
248 ret = trace_seq_printf(s, "map what?\n"); 248 ret = trace_seq_puts(s, "map what?\n");
249 break; 249 break;
250 } 250 }
251 if (ret) 251 if (ret)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bb922d9ee51b..34e7cbac0c9c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -78,7 +78,7 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
78 78
79 trace_assign_type(field, entry); 79 trace_assign_type(field, entry);
80 80
81 ret = trace_seq_printf(s, "%s", field->buf); 81 ret = trace_seq_puts(s, field->buf);
82 if (!ret) 82 if (!ret)
83 return TRACE_TYPE_PARTIAL_LINE; 83 return TRACE_TYPE_PARTIAL_LINE;
84 84
@@ -558,14 +558,14 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
558 if (ret) 558 if (ret)
559 ret = trace_seq_puts(s, "??"); 559 ret = trace_seq_puts(s, "??");
560 if (ret) 560 if (ret)
561 ret = trace_seq_puts(s, "\n"); 561 ret = trace_seq_putc(s, '\n');
562 continue; 562 continue;
563 } 563 }
564 if (!ret) 564 if (!ret)
565 break; 565 break;
566 if (ret) 566 if (ret)
567 ret = seq_print_user_ip(s, mm, ip, sym_flags); 567 ret = seq_print_user_ip(s, mm, ip, sym_flags);
568 ret = trace_seq_puts(s, "\n"); 568 ret = trace_seq_putc(s, '\n');
569 } 569 }
570 570
571 if (mm) 571 if (mm)
@@ -579,7 +579,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
579 int ret; 579 int ret;
580 580
581 if (!ip) 581 if (!ip)
582 return trace_seq_printf(s, "0"); 582 return trace_seq_putc(s, '0');
583 583
584 if (sym_flags & TRACE_ITER_SYM_OFFSET) 584 if (sym_flags & TRACE_ITER_SYM_OFFSET)
585 ret = seq_print_sym_offset(s, "%s", ip); 585 ret = seq_print_sym_offset(s, "%s", ip);
@@ -964,14 +964,14 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
964 goto partial; 964 goto partial;
965 965
966 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { 966 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
967 if (!trace_seq_printf(s, " <-")) 967 if (!trace_seq_puts(s, " <-"))
968 goto partial; 968 goto partial;
969 if (!seq_print_ip_sym(s, 969 if (!seq_print_ip_sym(s,
970 field->parent_ip, 970 field->parent_ip,
971 flags)) 971 flags))
972 goto partial; 972 goto partial;
973 } 973 }
974 if (!trace_seq_printf(s, "\n")) 974 if (!trace_seq_putc(s, '\n'))
975 goto partial; 975 goto partial;
976 976
977 return TRACE_TYPE_HANDLED; 977 return TRACE_TYPE_HANDLED;
@@ -1210,7 +1210,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1210 1210
1211 if (!seq_print_ip_sym(s, *p, flags)) 1211 if (!seq_print_ip_sym(s, *p, flags))
1212 goto partial; 1212 goto partial;
1213 if (!trace_seq_puts(s, "\n")) 1213 if (!trace_seq_putc(s, '\n'))
1214 goto partial; 1214 goto partial;
1215 } 1215 }
1216 1216
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 2901e3b88590..a7329b7902f8 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -640,13 +640,20 @@ out:
640 * Enable ftrace, sleep 1/10 second, and then read the trace 640 * Enable ftrace, sleep 1/10 second, and then read the trace
641 * buffer to see if all is in order. 641 * buffer to see if all is in order.
642 */ 642 */
643int 643__init int
644trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) 644trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
645{ 645{
646 int save_ftrace_enabled = ftrace_enabled; 646 int save_ftrace_enabled = ftrace_enabled;
647 unsigned long count; 647 unsigned long count;
648 int ret; 648 int ret;
649 649
650#ifdef CONFIG_DYNAMIC_FTRACE
651 if (ftrace_filter_param) {
652 printk(KERN_CONT " ... kernel command line filter set: force PASS ... ");
653 return 0;
654 }
655#endif
656
650 /* make sure msleep has been recorded */ 657 /* make sure msleep has been recorded */
651 msleep(1); 658 msleep(1);
652 659
@@ -727,13 +734,20 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
727 * Pretty much the same as for the function tracer from which the selftest 734 * Pretty much the same as for the function tracer from which the selftest
728 * has been borrowed. 735 * has been borrowed.
729 */ 736 */
730int 737__init int
731trace_selftest_startup_function_graph(struct tracer *trace, 738trace_selftest_startup_function_graph(struct tracer *trace,
732 struct trace_array *tr) 739 struct trace_array *tr)
733{ 740{
734 int ret; 741 int ret;
735 unsigned long count; 742 unsigned long count;
736 743
744#ifdef CONFIG_DYNAMIC_FTRACE
745 if (ftrace_filter_param) {
746 printk(KERN_CONT " ... kernel command line filter set: force PASS ... ");
747 return 0;
748 }
749#endif
750
737 /* 751 /*
738 * Simulate the init() callback but we attach a watchdog callback 752 * Simulate the init() callback but we attach a watchdog callback
739 * to detect and recover from possible hangs 753 * to detect and recover from possible hangs
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8f2ac73c7a5f..8fd03657bc7d 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -175,7 +175,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
175 entry = syscall_nr_to_meta(syscall); 175 entry = syscall_nr_to_meta(syscall);
176 176
177 if (!entry) { 177 if (!entry) {
178 trace_seq_printf(s, "\n"); 178 trace_seq_putc(s, '\n');
179 return TRACE_TYPE_HANDLED; 179 return TRACE_TYPE_HANDLED;
180 } 180 }
181 181
@@ -306,6 +306,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
306 struct syscall_metadata *sys_data; 306 struct syscall_metadata *sys_data;
307 struct ring_buffer_event *event; 307 struct ring_buffer_event *event;
308 struct ring_buffer *buffer; 308 struct ring_buffer *buffer;
309 unsigned long irq_flags;
310 int pc;
309 int syscall_nr; 311 int syscall_nr;
310 int size; 312 int size;
311 313
@@ -321,9 +323,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
321 323
322 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 324 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
323 325
326 local_save_flags(irq_flags);
327 pc = preempt_count();
328
324 buffer = tr->trace_buffer.buffer; 329 buffer = tr->trace_buffer.buffer;
325 event = trace_buffer_lock_reserve(buffer, 330 event = trace_buffer_lock_reserve(buffer,
326 sys_data->enter_event->event.type, size, 0, 0); 331 sys_data->enter_event->event.type, size, irq_flags, pc);
327 if (!event) 332 if (!event)
328 return; 333 return;
329 334
@@ -333,7 +338,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
333 338
334 if (!filter_current_check_discard(buffer, sys_data->enter_event, 339 if (!filter_current_check_discard(buffer, sys_data->enter_event,
335 entry, event)) 340 entry, event))
336 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 341 trace_current_buffer_unlock_commit(buffer, event,
342 irq_flags, pc);
337} 343}
338 344
339static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) 345static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -343,6 +349,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
343 struct syscall_metadata *sys_data; 349 struct syscall_metadata *sys_data;
344 struct ring_buffer_event *event; 350 struct ring_buffer_event *event;
345 struct ring_buffer *buffer; 351 struct ring_buffer *buffer;
352 unsigned long irq_flags;
353 int pc;
346 int syscall_nr; 354 int syscall_nr;
347 355
348 syscall_nr = trace_get_syscall_nr(current, regs); 356 syscall_nr = trace_get_syscall_nr(current, regs);
@@ -355,9 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
355 if (!sys_data) 363 if (!sys_data)
356 return; 364 return;
357 365
366 local_save_flags(irq_flags);
367 pc = preempt_count();
368
358 buffer = tr->trace_buffer.buffer; 369 buffer = tr->trace_buffer.buffer;
359 event = trace_buffer_lock_reserve(buffer, 370 event = trace_buffer_lock_reserve(buffer,
360 sys_data->exit_event->event.type, sizeof(*entry), 0, 0); 371 sys_data->exit_event->event.type, sizeof(*entry),
372 irq_flags, pc);
361 if (!event) 373 if (!event)
362 return; 374 return;
363 375
@@ -367,7 +379,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
367 379
368 if (!filter_current_check_discard(buffer, sys_data->exit_event, 380 if (!filter_current_check_discard(buffer, sys_data->exit_event,
369 entry, event)) 381 entry, event))
370 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 382 trace_current_buffer_unlock_commit(buffer, event,
383 irq_flags, pc);
371} 384}
372 385
373static int reg_event_syscall_enter(struct ftrace_event_file *file, 386static int reg_event_syscall_enter(struct ftrace_event_file *file,
@@ -553,15 +566,15 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
553 if (!sys_data) 566 if (!sys_data)
554 return; 567 return;
555 568
569 head = this_cpu_ptr(sys_data->enter_event->perf_events);
570 if (hlist_empty(head))
571 return;
572
556 /* get the size after alignment with the u32 buffer size field */ 573 /* get the size after alignment with the u32 buffer size field */
557 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); 574 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
558 size = ALIGN(size + sizeof(u32), sizeof(u64)); 575 size = ALIGN(size + sizeof(u32), sizeof(u64));
559 size -= sizeof(u32); 576 size -= sizeof(u32);
560 577
561 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
562 "perf buffer not large enough"))
563 return;
564
565 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 578 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
566 sys_data->enter_event->event.type, regs, &rctx); 579 sys_data->enter_event->event.type, regs, &rctx);
567 if (!rec) 580 if (!rec)
@@ -570,8 +583,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
570 rec->nr = syscall_nr; 583 rec->nr = syscall_nr;
571 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 584 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
572 (unsigned long *)&rec->args); 585 (unsigned long *)&rec->args);
573
574 head = this_cpu_ptr(sys_data->enter_event->perf_events);
575 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 586 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
576} 587}
577 588
@@ -629,18 +640,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
629 if (!sys_data) 640 if (!sys_data)
630 return; 641 return;
631 642
643 head = this_cpu_ptr(sys_data->exit_event->perf_events);
644 if (hlist_empty(head))
645 return;
646
632 /* We can probably do that at build time */ 647 /* We can probably do that at build time */
633 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); 648 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
634 size -= sizeof(u32); 649 size -= sizeof(u32);
635 650
636 /*
637 * Impossible, but be paranoid with the future
638 * How to put this check outside runtime?
639 */
640 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
641 "exit event has grown above perf buffer size"))
642 return;
643
644 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 651 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
645 sys_data->exit_event->event.type, regs, &rctx); 652 sys_data->exit_event->event.type, regs, &rctx);
646 if (!rec) 653 if (!rec)
@@ -648,8 +655,6 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
648 655
649 rec->nr = syscall_nr; 656 rec->nr = syscall_nr;
650 rec->ret = syscall_get_return_value(current, regs); 657 rec->ret = syscall_get_return_value(current, regs);
651
652 head = this_cpu_ptr(sys_data->exit_event->perf_events);
653 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 658 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
654} 659}
655 660
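Both the kprobe and syscall perf handlers above gain the same early bail-out: check the per-CPU perf_events hlist before doing any sizing or buffer preparation, and drop the runtime PERF_MAX_TRACE_SIZE warning that the known-bounded records no longer need. A sketch of the check in isolation, assuming a caller that already runs with preemption disabled; my_perf_handler() is illustrative, and ftrace_event_call::perf_events only exists with CONFIG_PERF_EVENTS:

#include <linux/ftrace_event.h>
#include <linux/list.h>
#include <linux/percpu.h>

static void my_perf_handler(struct ftrace_event_call *call)
{
        struct hlist_head *head;

        head = this_cpu_ptr(call->perf_events);
        if (hlist_empty(head))
                return;         /* no perf event attached on this CPU */

        /* ...only now compute the record size, prepare and submit it... */
}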
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 32494fb0ee64..272261b5f94f 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -70,7 +70,7 @@ struct trace_uprobe {
70 (sizeof(struct probe_arg) * (n))) 70 (sizeof(struct probe_arg) * (n)))
71 71
72static int register_uprobe_event(struct trace_uprobe *tu); 72static int register_uprobe_event(struct trace_uprobe *tu);
73static void unregister_uprobe_event(struct trace_uprobe *tu); 73static int unregister_uprobe_event(struct trace_uprobe *tu);
74 74
75static DEFINE_MUTEX(uprobe_lock); 75static DEFINE_MUTEX(uprobe_lock);
76static LIST_HEAD(uprobe_list); 76static LIST_HEAD(uprobe_list);
@@ -164,11 +164,17 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
164} 164}
165 165
166/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */ 166/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */
167static void unregister_trace_uprobe(struct trace_uprobe *tu) 167static int unregister_trace_uprobe(struct trace_uprobe *tu)
168{ 168{
169 int ret;
170
171 ret = unregister_uprobe_event(tu);
172 if (ret)
173 return ret;
174
169 list_del(&tu->list); 175 list_del(&tu->list);
170 unregister_uprobe_event(tu);
171 free_trace_uprobe(tu); 176 free_trace_uprobe(tu);
177 return 0;
172} 178}
173 179
174/* Register a trace_uprobe and probe_event */ 180/* Register a trace_uprobe and probe_event */
@@ -181,9 +187,12 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
181 187
182 /* register as an event */ 188 /* register as an event */
183 old_tp = find_probe_event(tu->call.name, tu->call.class->system); 189 old_tp = find_probe_event(tu->call.name, tu->call.class->system);
184 if (old_tp) 190 if (old_tp) {
185 /* delete old event */ 191 /* delete old event */
186 unregister_trace_uprobe(old_tp); 192 ret = unregister_trace_uprobe(old_tp);
193 if (ret)
194 goto end;
195 }
187 196
188 ret = register_uprobe_event(tu); 197 ret = register_uprobe_event(tu);
189 if (ret) { 198 if (ret) {
@@ -256,6 +265,8 @@ static int create_trace_uprobe(int argc, char **argv)
256 group = UPROBE_EVENT_SYSTEM; 265 group = UPROBE_EVENT_SYSTEM;
257 266
258 if (is_delete) { 267 if (is_delete) {
268 int ret;
269
259 if (!event) { 270 if (!event) {
260 pr_info("Delete command needs an event name.\n"); 271 pr_info("Delete command needs an event name.\n");
261 return -EINVAL; 272 return -EINVAL;
@@ -269,9 +280,9 @@ static int create_trace_uprobe(int argc, char **argv)
269 return -ENOENT; 280 return -ENOENT;
270 } 281 }
271 /* delete an event */ 282 /* delete an event */
272 unregister_trace_uprobe(tu); 283 ret = unregister_trace_uprobe(tu);
273 mutex_unlock(&uprobe_lock); 284 mutex_unlock(&uprobe_lock);
274 return 0; 285 return ret;
275 } 286 }
276 287
277 if (argc < 2) { 288 if (argc < 2) {
@@ -283,8 +294,10 @@ static int create_trace_uprobe(int argc, char **argv)
283 return -EINVAL; 294 return -EINVAL;
284 } 295 }
285 arg = strchr(argv[1], ':'); 296 arg = strchr(argv[1], ':');
286 if (!arg) 297 if (!arg) {
298 ret = -EINVAL;
287 goto fail_address_parse; 299 goto fail_address_parse;
300 }
288 301
289 *arg++ = '\0'; 302 *arg++ = '\0';
290 filename = argv[1]; 303 filename = argv[1];
@@ -406,16 +419,20 @@ fail_address_parse:
406 return ret; 419 return ret;
407} 420}
408 421
409static void cleanup_all_probes(void) 422static int cleanup_all_probes(void)
410{ 423{
411 struct trace_uprobe *tu; 424 struct trace_uprobe *tu;
425 int ret = 0;
412 426
413 mutex_lock(&uprobe_lock); 427 mutex_lock(&uprobe_lock);
414 while (!list_empty(&uprobe_list)) { 428 while (!list_empty(&uprobe_list)) {
415 tu = list_entry(uprobe_list.next, struct trace_uprobe, list); 429 tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
416 unregister_trace_uprobe(tu); 430 ret = unregister_trace_uprobe(tu);
431 if (ret)
432 break;
417 } 433 }
418 mutex_unlock(&uprobe_lock); 434 mutex_unlock(&uprobe_lock);
435 return ret;
419} 436}
420 437
421/* Probes listing interfaces */ 438/* Probes listing interfaces */
@@ -460,8 +477,13 @@ static const struct seq_operations probes_seq_op = {
460 477
461static int probes_open(struct inode *inode, struct file *file) 478static int probes_open(struct inode *inode, struct file *file)
462{ 479{
463 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) 480 int ret;
464 cleanup_all_probes(); 481
482 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
483 ret = cleanup_all_probes();
484 if (ret)
485 return ret;
486 }
465 487
466 return seq_open(file, &probes_seq_op); 488 return seq_open(file, &probes_seq_op);
467} 489}
@@ -816,8 +838,6 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
816 838
817 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); 839 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
818 size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); 840 size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);
819 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
820 return;
821 841
822 preempt_disable(); 842 preempt_disable();
823 head = this_cpu_ptr(call->perf_events); 843 head = this_cpu_ptr(call->perf_events);
@@ -968,12 +988,17 @@ static int register_uprobe_event(struct trace_uprobe *tu)
968 return ret; 988 return ret;
969} 989}
970 990
971static void unregister_uprobe_event(struct trace_uprobe *tu) 991static int unregister_uprobe_event(struct trace_uprobe *tu)
972{ 992{
993 int ret;
994
973 /* tu->event is unregistered in trace_remove_event_call() */ 995 /* tu->event is unregistered in trace_remove_event_call() */
974 trace_remove_event_call(&tu->call); 996 ret = trace_remove_event_call(&tu->call);
997 if (ret)
998 return ret;
975 kfree(tu->call.print_fmt); 999 kfree(tu->call.print_fmt);
976 tu->call.print_fmt = NULL; 1000 tu->call.print_fmt = NULL;
1001 return 0;
977} 1002}
978 1003
979/* Make a trace interface for controlling probe points */ 1004/* Make a trace interface for controlling probe points */
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index d8c30db06c5b..9064b919a406 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -62,6 +62,9 @@ int create_user_ns(struct cred *new)
62 kgid_t group = new->egid; 62 kgid_t group = new->egid;
63 int ret; 63 int ret;
64 64
65 if (parent_ns->level > 32)
66 return -EUSERS;
67
65 /* 68 /*
66 * Verify that we can not violate the policy of which files 69 * Verify that we can not violate the policy of which files
67 * may be accessed that is specified by the root directory, 70 * may be accessed that is specified by the root directory,
@@ -92,6 +95,7 @@ int create_user_ns(struct cred *new)
92 atomic_set(&ns->count, 1); 95 atomic_set(&ns->count, 1);
93 /* Leave the new->user_ns reference with the new user namespace. */ 96 /* Leave the new->user_ns reference with the new user namespace. */
94 ns->parent = parent_ns; 97 ns->parent = parent_ns;
98 ns->level = parent_ns->level + 1;
95 ns->owner = owner; 99 ns->owner = owner;
96 ns->group = group; 100 ns->group = group;
97 101
@@ -105,16 +109,21 @@ int create_user_ns(struct cred *new)
105int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) 109int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
106{ 110{
107 struct cred *cred; 111 struct cred *cred;
112 int err = -ENOMEM;
108 113
109 if (!(unshare_flags & CLONE_NEWUSER)) 114 if (!(unshare_flags & CLONE_NEWUSER))
110 return 0; 115 return 0;
111 116
112 cred = prepare_creds(); 117 cred = prepare_creds();
113 if (!cred) 118 if (cred) {
114 return -ENOMEM; 119 err = create_user_ns(cred);
120 if (err)
121 put_cred(cred);
122 else
123 *new_cred = cred;
124 }
115 125
116 *new_cred = cred; 126 return err;
117 return create_user_ns(cred);
118} 127}
119 128
120void free_user_ns(struct user_namespace *ns) 129void free_user_ns(struct user_namespace *ns)
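The user_namespace.c hunks above cap namespace nesting at 32 levels (returning -EUSERS beyond that) and rework unshare_userns() so the freshly prepared cred is dropped when create_user_ns() fails rather than being handed back half-initialized. A condensed sketch of that error path, assuming the same prepare_creds()/put_cred() reference rules; my_unshare() is illustrative:

#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/user_namespace.h>

static int my_unshare(struct cred **new_cred)
{
        struct cred *cred = prepare_creds();
        int err = -ENOMEM;

        if (cred) {
                err = create_user_ns(cred);     /* -EUSERS past the depth limit */
                if (err)
                        put_cred(cred);         /* don't leak the new cred */
                else
                        *new_cred = cred;
        }
        return err;
}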
diff --git a/kernel/wait.c b/kernel/wait.c
index 6698e0c04ead..dec68bd4e9d8 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -287,3 +287,92 @@ wait_queue_head_t *bit_waitqueue(void *word, int bit)
287 return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; 287 return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
288} 288}
289EXPORT_SYMBOL(bit_waitqueue); 289EXPORT_SYMBOL(bit_waitqueue);
290
291/*
292 * Manipulate the atomic_t address to produce a better bit waitqueue table hash
293 * index (we're keying off bit -1, but that would produce a horrible hash
294 * value).
295 */
296static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
297{
298 if (BITS_PER_LONG == 64) {
299 unsigned long q = (unsigned long)p;
300 return bit_waitqueue((void *)(q & ~1), q & 1);
301 }
302 return bit_waitqueue(p, 0);
303}
304
305static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync,
306 void *arg)
307{
308 struct wait_bit_key *key = arg;
309 struct wait_bit_queue *wait_bit
310 = container_of(wait, struct wait_bit_queue, wait);
311 atomic_t *val = key->flags;
312
313 if (wait_bit->key.flags != key->flags ||
314 wait_bit->key.bit_nr != key->bit_nr ||
315 atomic_read(val) != 0)
316 return 0;
317 return autoremove_wake_function(wait, mode, sync, key);
318}
319
320/*
 321 * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
 322 * the action routine passed to __wait_on_atomic_t() may return a code; a
 323 * nonzero return code halts the wait and is passed back to the caller.
324 */
325static __sched
326int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
327 int (*action)(atomic_t *), unsigned mode)
328{
329 atomic_t *val;
330 int ret = 0;
331
332 do {
333 prepare_to_wait(wq, &q->wait, mode);
334 val = q->key.flags;
335 if (atomic_read(val) == 0)
336 break;
337 ret = (*action)(val);
338 } while (!ret && atomic_read(val) != 0);
339 finish_wait(wq, &q->wait);
340 return ret;
341}
342
343#define DEFINE_WAIT_ATOMIC_T(name, p) \
344 struct wait_bit_queue name = { \
345 .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
346 .wait = { \
347 .private = current, \
348 .func = wake_atomic_t_function, \
349 .task_list = \
350 LIST_HEAD_INIT((name).wait.task_list), \
351 }, \
352 }
353
354__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
355 unsigned mode)
356{
357 wait_queue_head_t *wq = atomic_t_waitqueue(p);
358 DEFINE_WAIT_ATOMIC_T(wait, p);
359
360 return __wait_on_atomic_t(wq, &wait, action, mode);
361}
362EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
363
364/**
 365 * wake_up_atomic_t - Wake up a waiter on an atomic_t
366 * @word: The word being waited on, a kernel virtual address
367 * @bit: The bit of the word being waited on
368 *
369 * Wake up anyone waiting for the atomic_t to go to zero.
370 *
371 * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
 372 * check is done by the waiter's wake function, not by the waker itself).
373 */
374void wake_up_atomic_t(atomic_t *p)
375{
376 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
377}
378EXPORT_SYMBOL(wake_up_atomic_t);
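kernel/wait.c grows out_of_line_wait_on_atomic_t() and wake_up_atomic_t(), giving atomic_t counters the same hashed-waitqueue treatment as bit waits. A usage sketch for waiting on a reference count to reach zero, assuming the wait_on_atomic_t() wrapper in <linux/wait.h> that pairs with these out-of-line helpers; struct my_obj and my_action() are illustrative:

#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/wait.h>

struct my_obj {
        atomic_t        users;
};

/* Runs each time the waiter is about to sleep; a non-zero return aborts
 * the wait and is handed back to the caller. */
static int my_action(atomic_t *val)
{
        schedule();
        return 0;
}

static void my_wait_until_idle(struct my_obj *obj)
{
        wait_on_atomic_t(&obj->users, my_action, TASK_UNINTERRUPTIBLE);
}

static void my_put(struct my_obj *obj)
{
        if (atomic_dec_and_test(&obj->users))
                wake_up_atomic_t(&obj->users);
}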
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 05039e348f07..1241d8c91d5e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,9 +29,9 @@
29#include <linux/kvm_para.h> 29#include <linux/kvm_para.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31 31
32int watchdog_enabled = 1; 32int watchdog_user_enabled = 1;
33int __read_mostly watchdog_thresh = 10; 33int __read_mostly watchdog_thresh = 10;
34static int __read_mostly watchdog_disabled; 34static int __read_mostly watchdog_running;
35static u64 __read_mostly sample_period; 35static u64 __read_mostly sample_period;
36 36
37static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 37static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -63,7 +63,7 @@ static int __init hardlockup_panic_setup(char *str)
63 else if (!strncmp(str, "nopanic", 7)) 63 else if (!strncmp(str, "nopanic", 7))
64 hardlockup_panic = 0; 64 hardlockup_panic = 0;
65 else if (!strncmp(str, "0", 1)) 65 else if (!strncmp(str, "0", 1))
66 watchdog_enabled = 0; 66 watchdog_user_enabled = 0;
67 return 1; 67 return 1;
68} 68}
69__setup("nmi_watchdog=", hardlockup_panic_setup); 69__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -82,7 +82,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);
82 82
83static int __init nowatchdog_setup(char *str) 83static int __init nowatchdog_setup(char *str)
84{ 84{
85 watchdog_enabled = 0; 85 watchdog_user_enabled = 0;
86 return 1; 86 return 1;
87} 87}
88__setup("nowatchdog", nowatchdog_setup); 88__setup("nowatchdog", nowatchdog_setup);
@@ -90,7 +90,7 @@ __setup("nowatchdog", nowatchdog_setup);
90/* deprecated */ 90/* deprecated */
91static int __init nosoftlockup_setup(char *str) 91static int __init nosoftlockup_setup(char *str)
92{ 92{
93 watchdog_enabled = 0; 93 watchdog_user_enabled = 0;
94 return 1; 94 return 1;
95} 95}
96__setup("nosoftlockup", nosoftlockup_setup); 96__setup("nosoftlockup", nosoftlockup_setup);
@@ -158,7 +158,7 @@ void touch_all_softlockup_watchdogs(void)
158#ifdef CONFIG_HARDLOCKUP_DETECTOR 158#ifdef CONFIG_HARDLOCKUP_DETECTOR
159void touch_nmi_watchdog(void) 159void touch_nmi_watchdog(void)
160{ 160{
161 if (watchdog_enabled) { 161 if (watchdog_user_enabled) {
162 unsigned cpu; 162 unsigned cpu;
163 163
164 for_each_present_cpu(cpu) { 164 for_each_present_cpu(cpu) {
@@ -347,11 +347,6 @@ static void watchdog_enable(unsigned int cpu)
347 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 347 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
348 hrtimer->function = watchdog_timer_fn; 348 hrtimer->function = watchdog_timer_fn;
349 349
350 if (!watchdog_enabled) {
351 kthread_park(current);
352 return;
353 }
354
355 /* Enable the perf event */ 350 /* Enable the perf event */
356 watchdog_nmi_enable(cpu); 351 watchdog_nmi_enable(cpu);
357 352
@@ -374,6 +369,11 @@ static void watchdog_disable(unsigned int cpu)
374 watchdog_nmi_disable(cpu); 369 watchdog_nmi_disable(cpu);
375} 370}
376 371
372static void watchdog_cleanup(unsigned int cpu, bool online)
373{
374 watchdog_disable(cpu);
375}
376
377static int watchdog_should_run(unsigned int cpu) 377static int watchdog_should_run(unsigned int cpu)
378{ 378{
379 return __this_cpu_read(hrtimer_interrupts) != 379 return __this_cpu_read(hrtimer_interrupts) !=
@@ -475,28 +475,40 @@ static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
475static void watchdog_nmi_disable(unsigned int cpu) { return; } 475static void watchdog_nmi_disable(unsigned int cpu) { return; }
476#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 476#endif /* CONFIG_HARDLOCKUP_DETECTOR */
477 477
478/* prepare/enable/disable routines */ 478static struct smp_hotplug_thread watchdog_threads = {
479/* sysctl functions */ 479 .store = &softlockup_watchdog,
480#ifdef CONFIG_SYSCTL 480 .thread_should_run = watchdog_should_run,
481static void watchdog_enable_all_cpus(void) 481 .thread_fn = watchdog,
482 .thread_comm = "watchdog/%u",
483 .setup = watchdog_enable,
484 .cleanup = watchdog_cleanup,
485 .park = watchdog_disable,
486 .unpark = watchdog_enable,
487};
488
489static int watchdog_enable_all_cpus(void)
482{ 490{
483 unsigned int cpu; 491 int err = 0;
484 492
485 if (watchdog_disabled) { 493 if (!watchdog_running) {
486 watchdog_disabled = 0; 494 err = smpboot_register_percpu_thread(&watchdog_threads);
487 for_each_online_cpu(cpu) 495 if (err)
488 kthread_unpark(per_cpu(softlockup_watchdog, cpu)); 496 pr_err("Failed to create watchdog threads, disabled\n");
497 else
498 watchdog_running = 1;
489 } 499 }
500
501 return err;
490} 502}
491 503
504/* prepare/enable/disable routines */
505/* sysctl functions */
506#ifdef CONFIG_SYSCTL
492static void watchdog_disable_all_cpus(void) 507static void watchdog_disable_all_cpus(void)
493{ 508{
494 unsigned int cpu; 509 if (watchdog_running) {
495 510 watchdog_running = 0;
496 if (!watchdog_disabled) { 511 smpboot_unregister_percpu_thread(&watchdog_threads);
497 watchdog_disabled = 1;
498 for_each_online_cpu(cpu)
499 kthread_park(per_cpu(softlockup_watchdog, cpu));
500 } 512 }
501} 513}
502 514
@@ -507,45 +519,48 @@ static void watchdog_disable_all_cpus(void)
507int proc_dowatchdog(struct ctl_table *table, int write, 519int proc_dowatchdog(struct ctl_table *table, int write,
508 void __user *buffer, size_t *lenp, loff_t *ppos) 520 void __user *buffer, size_t *lenp, loff_t *ppos)
509{ 521{
510 int ret; 522 int err, old_thresh, old_enabled;
511 523
512 if (watchdog_disabled < 0) 524 old_thresh = ACCESS_ONCE(watchdog_thresh);
513 return -ENODEV; 525 old_enabled = ACCESS_ONCE(watchdog_user_enabled);
514 526
515 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 527 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
516 if (ret || !write) 528 if (err || !write)
517 return ret; 529 return err;
518 530
519 set_sample_period(); 531 set_sample_period();
520 /* 532 /*
521 * Watchdog threads shouldn't be enabled if they are 533 * Watchdog threads shouldn't be enabled if they are
522 * disabled. The 'watchdog_disabled' variable check in 534 * disabled. The 'watchdog_running' variable check in
523 * watchdog_*_all_cpus() function takes care of this. 535 * watchdog_*_all_cpus() function takes care of this.
524 */ 536 */
525 if (watchdog_enabled && watchdog_thresh) 537 if (watchdog_user_enabled && watchdog_thresh)
526 watchdog_enable_all_cpus(); 538 err = watchdog_enable_all_cpus();
527 else 539 else
528 watchdog_disable_all_cpus(); 540 watchdog_disable_all_cpus();
529 541
530 return ret; 542 /* Restore old values on failure */
543 if (err) {
544 watchdog_thresh = old_thresh;
545 watchdog_user_enabled = old_enabled;
546 }
547
548 return err;
531} 549}
532#endif /* CONFIG_SYSCTL */ 550#endif /* CONFIG_SYSCTL */
533 551
534static struct smp_hotplug_thread watchdog_threads = {
535 .store = &softlockup_watchdog,
536 .thread_should_run = watchdog_should_run,
537 .thread_fn = watchdog,
538 .thread_comm = "watchdog/%u",
539 .setup = watchdog_enable,
540 .park = watchdog_disable,
541 .unpark = watchdog_enable,
542};
543
544void __init lockup_detector_init(void) 552void __init lockup_detector_init(void)
545{ 553{
546 set_sample_period(); 554 set_sample_period();
547 if (smpboot_register_percpu_thread(&watchdog_threads)) { 555
548 pr_err("Failed to create watchdog threads, disabled\n"); 556#ifdef CONFIG_NO_HZ_FULL
549 watchdog_disabled = -ENODEV; 557 if (watchdog_user_enabled) {
558 watchdog_user_enabled = 0;
559 pr_warning("Disabled lockup detectors by default for full dynticks\n");
560 pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n");
550 } 561 }
562#endif
563
564 if (watchdog_user_enabled)
565 watchdog_enable_all_cpus();
551} 566}
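The watchdog rework above stops pre-creating and parking per-CPU threads: the smp_hotplug_thread descriptor is now registered and unregistered on demand, watchdog_running tracks that state, and the sysctl handler rolls the knobs back if registration fails. A generic sketch of the smpboot per-CPU thread pattern it now relies on (all my_* names are illustrative, not the watchdog's own):

#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, my_task);

static int my_should_run(unsigned int cpu)
{
        return 0;               /* nothing to do in this sketch */
}

static void my_fn(unsigned int cpu)
{
}

static void my_enable(unsigned int cpu)
{
        /* per-CPU setup; also runs on unpark after CPU hotplug */
}

static void my_disable(unsigned int cpu)
{
        /* per-CPU teardown; runs on park and, via cleanup, on hot-unplug */
}

static void my_cleanup(unsigned int cpu, bool online)
{
        my_disable(cpu);
}

static struct smp_hotplug_thread my_threads = {
        .store                  = &my_task,
        .thread_should_run      = my_should_run,
        .thread_fn              = my_fn,
        .thread_comm            = "my_thread/%u",
        .setup                  = my_enable,
        .cleanup                = my_cleanup,
        .park                   = my_disable,
        .unpark                 = my_enable,
};

static int __init my_init(void)
{
        return smpboot_register_percpu_thread(&my_threads);
}
early_initcall(my_init);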
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee8e29a2320c..7f5d4be22034 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -272,6 +272,15 @@ static cpumask_var_t *wq_numa_possible_cpumask;
272static bool wq_disable_numa; 272static bool wq_disable_numa;
273module_param_named(disable_numa, wq_disable_numa, bool, 0444); 273module_param_named(disable_numa, wq_disable_numa, bool, 0444);
274 274
275/* see the comment above the definition of WQ_POWER_EFFICIENT */
276#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
277static bool wq_power_efficient = true;
278#else
279static bool wq_power_efficient;
280#endif
281
282module_param_named(power_efficient, wq_power_efficient, bool, 0444);
283
275static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ 284static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
276 285
277/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ 286/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
@@ -305,6 +314,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly;
305EXPORT_SYMBOL_GPL(system_unbound_wq); 314EXPORT_SYMBOL_GPL(system_unbound_wq);
306struct workqueue_struct *system_freezable_wq __read_mostly; 315struct workqueue_struct *system_freezable_wq __read_mostly;
307EXPORT_SYMBOL_GPL(system_freezable_wq); 316EXPORT_SYMBOL_GPL(system_freezable_wq);
317struct workqueue_struct *system_power_efficient_wq __read_mostly;
318EXPORT_SYMBOL_GPL(system_power_efficient_wq);
319struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
320EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
308 321
309static int worker_thread(void *__worker); 322static int worker_thread(void *__worker);
310static void copy_workqueue_attrs(struct workqueue_attrs *to, 323static void copy_workqueue_attrs(struct workqueue_attrs *to,
@@ -2804,6 +2817,19 @@ already_gone:
2804 return false; 2817 return false;
2805} 2818}
2806 2819
2820static bool __flush_work(struct work_struct *work)
2821{
2822 struct wq_barrier barr;
2823
2824 if (start_flush_work(work, &barr)) {
2825 wait_for_completion(&barr.done);
2826 destroy_work_on_stack(&barr.work);
2827 return true;
2828 } else {
2829 return false;
2830 }
2831}
2832
2807/** 2833/**
2808 * flush_work - wait for a work to finish executing the last queueing instance 2834 * flush_work - wait for a work to finish executing the last queueing instance
2809 * @work: the work to flush 2835 * @work: the work to flush
@@ -2817,18 +2843,10 @@ already_gone:
2817 */ 2843 */
2818bool flush_work(struct work_struct *work) 2844bool flush_work(struct work_struct *work)
2819{ 2845{
2820 struct wq_barrier barr;
2821
2822 lock_map_acquire(&work->lockdep_map); 2846 lock_map_acquire(&work->lockdep_map);
2823 lock_map_release(&work->lockdep_map); 2847 lock_map_release(&work->lockdep_map);
2824 2848
2825 if (start_flush_work(work, &barr)) { 2849 return __flush_work(work);
2826 wait_for_completion(&barr.done);
2827 destroy_work_on_stack(&barr.work);
2828 return true;
2829 } else {
2830 return false;
2831 }
2832} 2850}
2833EXPORT_SYMBOL_GPL(flush_work); 2851EXPORT_SYMBOL_GPL(flush_work);
2834 2852
@@ -3398,6 +3416,12 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
3398{ 3416{
3399 to->nice = from->nice; 3417 to->nice = from->nice;
3400 cpumask_copy(to->cpumask, from->cpumask); 3418 cpumask_copy(to->cpumask, from->cpumask);
3419 /*
3420 * Unlike hash and equality test, this function doesn't ignore
3421 * ->no_numa as it is used for both pool and wq attrs. Instead,
3422 * get_unbound_pool() explicitly clears ->no_numa after copying.
3423 */
3424 to->no_numa = from->no_numa;
3401} 3425}
3402 3426
3403/* hash value of the content of @attr */ 3427/* hash value of the content of @attr */
@@ -3565,6 +3589,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3565 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ 3589 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
3566 copy_workqueue_attrs(pool->attrs, attrs); 3590 copy_workqueue_attrs(pool->attrs, attrs);
3567 3591
3592 /*
3593 * no_numa isn't a worker_pool attribute, always clear it. See
3594 * 'struct workqueue_attrs' comments for detail.
3595 */
3596 pool->attrs->no_numa = false;
3597
3568 /* if cpumask is contained inside a NUMA node, we belong to that node */ 3598 /* if cpumask is contained inside a NUMA node, we belong to that node */
3569 if (wq_numa_enabled) { 3599 if (wq_numa_enabled) {
3570 for_each_node(node) { 3600 for_each_node(node) {
@@ -4086,6 +4116,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
4086 struct workqueue_struct *wq; 4116 struct workqueue_struct *wq;
4087 struct pool_workqueue *pwq; 4117 struct pool_workqueue *pwq;
4088 4118
4119 /* see the comment above the definition of WQ_POWER_EFFICIENT */
4120 if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
4121 flags |= WQ_UNBOUND;
4122
4089 /* allocate wq and format name */ 4123 /* allocate wq and format name */
4090 if (flags & WQ_UNBOUND) 4124 if (flags & WQ_UNBOUND)
4091 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); 4125 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
@@ -4627,7 +4661,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4627 * Workqueues should be brought up before normal priority CPU notifiers. 4661 * Workqueues should be brought up before normal priority CPU notifiers.
4628 * This will be registered high priority CPU notifier. 4662 * This will be registered high priority CPU notifier.
4629 */ 4663 */
4630static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, 4664static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4631 unsigned long action, 4665 unsigned long action,
4632 void *hcpu) 4666 void *hcpu)
4633{ 4667{
@@ -4680,7 +4714,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
4680 * Workqueues should be brought down after normal priority CPU notifiers. 4714 * Workqueues should be brought down after normal priority CPU notifiers.
4681 * This will be registered as low priority CPU notifier. 4715 * This will be registered as low priority CPU notifier.
4682 */ 4716 */
4683static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, 4717static int workqueue_cpu_down_callback(struct notifier_block *nfb,
4684 unsigned long action, 4718 unsigned long action,
4685 void *hcpu) 4719 void *hcpu)
4686{ 4720{
@@ -4739,7 +4773,14 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4739 4773
4740 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); 4774 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
4741 schedule_work_on(cpu, &wfc.work); 4775 schedule_work_on(cpu, &wfc.work);
4742 flush_work(&wfc.work); 4776
4777 /*
4778 * The work item is on-stack and can't lead to deadlock through
4779 * flushing. Use __flush_work() to avoid spurious lockdep warnings
4780 * when work_on_cpu()s are nested.
4781 */
4782 __flush_work(&wfc.work);
4783
4743 return wfc.ret; 4784 return wfc.ret;
4744} 4785}
4745EXPORT_SYMBOL_GPL(work_on_cpu); 4786EXPORT_SYMBOL_GPL(work_on_cpu);
@@ -4985,8 +5026,15 @@ static int __init init_workqueues(void)
4985 WQ_UNBOUND_MAX_ACTIVE); 5026 WQ_UNBOUND_MAX_ACTIVE);
4986 system_freezable_wq = alloc_workqueue("events_freezable", 5027 system_freezable_wq = alloc_workqueue("events_freezable",
4987 WQ_FREEZABLE, 0); 5028 WQ_FREEZABLE, 0);
5029 system_power_efficient_wq = alloc_workqueue("events_power_efficient",
5030 WQ_POWER_EFFICIENT, 0);
5031 system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
5032 WQ_FREEZABLE | WQ_POWER_EFFICIENT,
5033 0);
4988 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || 5034 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
4989 !system_unbound_wq || !system_freezable_wq); 5035 !system_unbound_wq || !system_freezable_wq ||
5036 !system_power_efficient_wq ||
5037 !system_freezable_power_efficient_wq);
4990 return 0; 5038 return 0;
4991} 5039}
4992early_initcall(init_workqueues); 5040early_initcall(init_workqueues);
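The workqueue.c changes above add the power_efficient module parameter, the system_power_efficient_wq and system_freezable_power_efficient_wq pools, and an internal __flush_work() so work_on_cpu() can flush its on-stack item without spurious lockdep reports when calls nest. A usage sketch for the new power-efficient queues, assuming the matching WQ_POWER_EFFICIENT flag and workqueue declarations in <linux/workqueue.h>; all my_* names are illustrative:

#include <linux/errno.h>
#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
        /* background housekeeping with no strict CPU affinity */
}
static DECLARE_WORK(my_work, my_work_fn);

static struct workqueue_struct *my_wq;

static int my_setup(void)
{
        /* Becomes WQ_UNBOUND when workqueue.power_efficient is set, letting
         * the scheduler pick an already-awake CPU instead of the local one. */
        my_wq = alloc_workqueue("my_wq", WQ_POWER_EFFICIENT, 0);
        if (!my_wq)
                return -ENOMEM;

        /* Or simply reuse the shared power-efficient system workqueue. */
        queue_work(system_power_efficient_wq, &my_work);
        return 0;
}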
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index ad83c96b2ece..7e2204db0b1a 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -64,7 +64,7 @@ static inline struct worker *current_wq_worker(void)
64 64
65/* 65/*
66 * Scheduler hooks for concurrency managed workqueue. Only to be used from 66 * Scheduler hooks for concurrency managed workqueue. Only to be used from
67 * sched.c and workqueue.c. 67 * sched/core.c and workqueue.c.
68 */ 68 */
69void wq_worker_waking_up(struct task_struct *task, int cpu); 69void wq_worker_waking_up(struct task_struct *task, int cpu);
70struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); 70struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);