author     Thomas Gleixner <tglx@linutronix.de>   2013-07-12 06:34:42 -0400
committer  Thomas Gleixner <tglx@linutronix.de>   2013-07-12 06:34:42 -0400
commit     f2006e27396f55276f24434f56e208d86e7f9908 (patch)
tree       71896db916d33888b4286f80117d3cac0da40e6d /kernel
parent     e399eb56a6110e13f97e644658648602e2b08de7 (diff)
parent     9903883f1dd6e86f286b7bfa6e4b423f98c1cd9e (diff)
Merge branch 'linus' into timers/urgent
Get upstream changes so we can apply fixes against them

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 2
-rw-r--r--  kernel/audit.h | 1
-rw-r--r--  kernel/auditfilter.c | 8
-rw-r--r--  kernel/auditsc.c | 12
-rw-r--r--  kernel/cgroup.c | 1536
-rw-r--r--  kernel/cpuset.c | 478
-rw-r--r--  kernel/events/core.c | 4
-rw-r--r--  kernel/exit.c | 15
-rw-r--r--  kernel/fork.c | 70
-rw-r--r--  kernel/freezer.c | 12
-rw-r--r--  kernel/hrtimer.c | 35
-rw-r--r--  kernel/irq/generic-chip.c | 8
-rw-r--r--  kernel/irq/irqdomain.c | 579
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/kmod.c | 11
-rw-r--r--  kernel/kprobes.c | 3
-rw-r--r--  kernel/lockdep.c | 17
-rw-r--r--  kernel/module.c | 77
-rw-r--r--  kernel/panic.c | 8
-rw-r--r--  kernel/params.c | 2
-rw-r--r--  kernel/pid.c | 14
-rw-r--r--  kernel/posix-cpu-timers.c | 395
-rw-r--r--  kernel/power/Kconfig | 20
-rw-r--r--  kernel/power/main.c | 6
-rw-r--r--  kernel/power/process.c | 26
-rw-r--r--  kernel/power/qos.c | 14
-rw-r--r--  kernel/power/snapshot.c | 9
-rw-r--r--  kernel/power/suspend.c | 2
-rw-r--r--  kernel/printk.c | 2
-rw-r--r--  kernel/ptrace.c | 61
-rw-r--r--  kernel/rcutree.c | 2
-rw-r--r--  kernel/reboot.c | 419
-rw-r--r--  kernel/resource.c | 1
-rw-r--r--  kernel/sched/stats.h | 39
-rw-r--r--  kernel/signal.c | 2
-rw-r--r--  kernel/sys.c | 352
-rw-r--r--  kernel/sysctl.c | 7
-rw-r--r--  kernel/sysctl_binary.c | 1
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/alarmtimer.c | 47
-rw-r--r--  kernel/time/clockevents.c | 271
-rw-r--r--  kernel/time/clocksource.c | 266
-rw-r--r--  kernel/time/sched_clock.c | 212
-rw-r--r--  kernel/time/tick-broadcast.c | 126
-rw-r--r--  kernel/time/tick-common.c | 197
-rw-r--r--  kernel/time/tick-internal.h | 17
-rw-r--r--  kernel/time/timekeeping.c | 65
-rw-r--r--  kernel/time/timekeeping_debug.c | 72
-rw-r--r--  kernel/time/timekeeping_internal.h | 14
-rw-r--r--  kernel/timer.c | 8
-rw-r--r--  kernel/trace/ftrace.c | 38
-rw-r--r--  kernel/trace/trace.c | 338
-rw-r--r--  kernel/trace/trace.h | 17
-rw-r--r--  kernel/trace/trace_events.c | 166
-rw-r--r--  kernel/trace/trace_events_filter.c | 6
-rw-r--r--  kernel/trace/trace_functions.c | 103
-rw-r--r--  kernel/trace/trace_irqsoff.c | 4
-rw-r--r--  kernel/trace/trace_kprobe.c | 190
-rw-r--r--  kernel/trace/trace_selftest.c | 18
-rw-r--r--  kernel/trace/trace_syscalls.c | 21
-rw-r--r--  kernel/trace/trace_uprobe.c | 4
-rw-r--r--  kernel/workqueue.c | 26
62 files changed, 3927 insertions, 2553 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 271fd3119af9..470839d1a30e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
         rcupdate.o extable.o params.o posix-timers.o \
         kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
         hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-        notifier.o ksysfs.o cred.o \
+        notifier.o ksysfs.o cred.o reboot.o \
         async.o range.o groups.o lglock.o smpboot.o
 
 ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/audit.h b/kernel/audit.h
index 1c95131ef760..123c9b7c3979 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -85,6 +85,7 @@ struct audit_names {
 
         struct filename *name;
         int name_len; /* number of chars to log */
+        bool hidden; /* don't log this record */
         bool name_put; /* call __putname()? */
 
         unsigned long ino;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 6bd4a90d1991..f7aee8be7fb2 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
                 f->lsm_rule = NULL;
 
                 /* Support legacy tests for a valid loginuid */
-                if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) {
+                if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) {
                         f->type = AUDIT_LOGINUID_SET;
                         f->val = 0;
                 }
@@ -865,6 +865,12 @@ static inline int audit_add_rule(struct audit_entry *entry)
                 err = audit_add_watch(&entry->rule, &list);
                 if (err) {
                         mutex_unlock(&audit_filter_mutex);
+                        /*
+                         * normally audit_add_tree_rule() will free it
+                         * on failure
+                         */
+                        if (tree)
+                                audit_put_tree(tree);
                         goto error;
                 }
         }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3c8a601324a2..9845cb32b60a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1399,8 +1399,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
         }
 
         i = 0;
-        list_for_each_entry(n, &context->names_list, list)
+        list_for_each_entry(n, &context->names_list, list) {
+                if (n->hidden)
+                        continue;
                 audit_log_name(context, n, NULL, i++, &call_panic);
+        }
 
         /* Send end of event record to help user space know we are finished */
         ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1769,14 +1772,15 @@ void audit_putname(struct filename *name)
  * __audit_inode - store the inode and device from a lookup
  * @name: name being audited
  * @dentry: dentry being audited
- * @parent: does this dentry represent the parent?
+ * @flags: attributes for this particular entry
  */
 void __audit_inode(struct filename *name, const struct dentry *dentry,
-                   unsigned int parent)
+                   unsigned int flags)
 {
         struct audit_context *context = current->audit_context;
         const struct inode *inode = dentry->d_inode;
         struct audit_names *n;
+        bool parent = flags & AUDIT_INODE_PARENT;
 
         if (!context->in_syscall)
                 return;
@@ -1831,6 +1835,8 @@ out:
         if (parent) {
                 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
                 n->type = AUDIT_TYPE_PARENT;
+                if (flags & AUDIT_INODE_HIDDEN)
+                        n->hidden = true;
         } else {
                 n->name_len = AUDIT_NAME_FULL;
                 n->type = AUDIT_TYPE_NORMAL;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7c9e6ddb979..e5583d10a325 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,9 +63,6 @@
63 63
64#include <linux/atomic.h> 64#include <linux/atomic.h>
65 65
66/* css deactivation bias, makes css->refcnt negative to deny new trygets */
67#define CSS_DEACT_BIAS INT_MIN
68
69/* 66/*
70 * cgroup_mutex is the master lock. Any modification to cgroup or its 67 * cgroup_mutex is the master lock. Any modification to cgroup or its
71 * hierarchy must be performed while holding it. 68 * hierarchy must be performed while holding it.
@@ -99,16 +96,19 @@ static DEFINE_MUTEX(cgroup_root_mutex);
99 */ 96 */
100#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, 97#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
101#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) 98#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
102static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { 99static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
103#include <linux/cgroup_subsys.h> 100#include <linux/cgroup_subsys.h>
104}; 101};
105 102
106/* 103/*
107 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 104 * The dummy hierarchy, reserved for the subsystems that are otherwise
108 * subsystems that are otherwise unattached - it never has more than a 105 * unattached - it never has more than a single cgroup, and all tasks are
109 * single cgroup, and all tasks are part of that cgroup. 106 * part of that cgroup.
110 */ 107 */
111static struct cgroupfs_root rootnode; 108static struct cgroupfs_root cgroup_dummy_root;
109
110/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
111static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
112 112
113/* 113/*
114 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. 114 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
@@ -186,18 +186,28 @@ struct cgroup_event {
186 186
187/* The list of hierarchy roots */ 187/* The list of hierarchy roots */
188 188
189static LIST_HEAD(roots); 189static LIST_HEAD(cgroup_roots);
190static int root_count; 190static int cgroup_root_count;
191 191
192static DEFINE_IDA(hierarchy_ida); 192/*
193static int next_hierarchy_id; 193 * Hierarchy ID allocation and mapping. It follows the same exclusion
194static DEFINE_SPINLOCK(hierarchy_id_lock); 194 * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
195 195 * writes, either for reads.
196/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 196 */
197#define dummytop (&rootnode.top_cgroup) 197static DEFINE_IDR(cgroup_hierarchy_idr);
198 198
199static struct cgroup_name root_cgroup_name = { .name = "/" }; 199static struct cgroup_name root_cgroup_name = { .name = "/" };
200 200
201/*
202 * Assign a monotonically increasing serial number to cgroups. It
203 * guarantees cgroups with bigger numbers are newer than those with smaller
204 * numbers. Also, as cgroups are always appended to the parent's
205 * ->children list, it guarantees that sibling cgroups are always sorted in
206 * the ascending serial number order on the list. Protected by
207 * cgroup_mutex.
208 */
209static u64 cgroup_serial_nr_next = 1;
210
201/* This flag indicates whether tasks in the fork and exit paths should 211/* This flag indicates whether tasks in the fork and exit paths should
202 * check for fork/exit handlers to call. This avoids us having to do 212 * check for fork/exit handlers to call. This avoids us having to do
203 * extra work in the fork/exit path if none of the subsystems need to 213 * extra work in the fork/exit path if none of the subsystems need to
@@ -205,27 +215,15 @@ static struct cgroup_name root_cgroup_name = { .name = "/" };
205 */ 215 */
206static int need_forkexit_callback __read_mostly; 216static int need_forkexit_callback __read_mostly;
207 217
218static void cgroup_offline_fn(struct work_struct *work);
208static int cgroup_destroy_locked(struct cgroup *cgrp); 219static int cgroup_destroy_locked(struct cgroup *cgrp);
209static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 220static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
210 struct cftype cfts[], bool is_add); 221 struct cftype cfts[], bool is_add);
211 222
212static int css_unbias_refcnt(int refcnt)
213{
214 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
215}
216
217/* the current nr of refs, always >= 0 whether @css is deactivated or not */
218static int css_refcnt(struct cgroup_subsys_state *css)
219{
220 int v = atomic_read(&css->refcnt);
221
222 return css_unbias_refcnt(v);
223}
224
225/* convenient tests for these bits */ 223/* convenient tests for these bits */
226inline int cgroup_is_removed(const struct cgroup *cgrp) 224static inline bool cgroup_is_dead(const struct cgroup *cgrp)
227{ 225{
228 return test_bit(CGRP_REMOVED, &cgrp->flags); 226 return test_bit(CGRP_DEAD, &cgrp->flags);
229} 227}
230 228
231/** 229/**
@@ -261,16 +259,38 @@ static int notify_on_release(const struct cgroup *cgrp)
261 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 259 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
262} 260}
263 261
264/* 262/**
265 * for_each_subsys() allows you to iterate on each subsystem attached to 263 * for_each_subsys - iterate all loaded cgroup subsystems
266 * an active hierarchy 264 * @ss: the iteration cursor
265 * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
266 *
267 * Should be called under cgroup_mutex.
267 */ 268 */
268#define for_each_subsys(_root, _ss) \ 269#define for_each_subsys(ss, i) \
269list_for_each_entry(_ss, &_root->subsys_list, sibling) 270 for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \
271 if (({ lockdep_assert_held(&cgroup_mutex); \
272 !((ss) = cgroup_subsys[i]); })) { } \
273 else
274
275/**
276 * for_each_builtin_subsys - iterate all built-in cgroup subsystems
277 * @ss: the iteration cursor
278 * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
279 *
280 * Bulit-in subsystems are always present and iteration itself doesn't
281 * require any synchronization.
282 */
283#define for_each_builtin_subsys(ss, i) \
284 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
285 (((ss) = cgroup_subsys[i]) || true); (i)++)
286
287/* iterate each subsystem attached to a hierarchy */
288#define for_each_root_subsys(root, ss) \
289 list_for_each_entry((ss), &(root)->subsys_list, sibling)
270 290
271/* for_each_active_root() allows you to iterate across the active hierarchies */ 291/* iterate across the active hierarchies */
272#define for_each_active_root(_root) \ 292#define for_each_active_root(root) \
273list_for_each_entry(_root, &roots, root_list) 293 list_for_each_entry((root), &cgroup_roots, root_list)
274 294
275static inline struct cgroup *__d_cgrp(struct dentry *dentry) 295static inline struct cgroup *__d_cgrp(struct dentry *dentry)
276{ 296{
@@ -297,7 +317,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
297static bool cgroup_lock_live_group(struct cgroup *cgrp) 317static bool cgroup_lock_live_group(struct cgroup *cgrp)
298{ 318{
299 mutex_lock(&cgroup_mutex); 319 mutex_lock(&cgroup_mutex);
300 if (cgroup_is_removed(cgrp)) { 320 if (cgroup_is_dead(cgrp)) {
301 mutex_unlock(&cgroup_mutex); 321 mutex_unlock(&cgroup_mutex);
302 return false; 322 return false;
303 } 323 }
@@ -312,20 +332,24 @@ static void cgroup_release_agent(struct work_struct *work);
312static DECLARE_WORK(release_agent_work, cgroup_release_agent); 332static DECLARE_WORK(release_agent_work, cgroup_release_agent);
313static void check_for_release(struct cgroup *cgrp); 333static void check_for_release(struct cgroup *cgrp);
314 334
315/* Link structure for associating css_set objects with cgroups */ 335/*
316struct cg_cgroup_link { 336 * A cgroup can be associated with multiple css_sets as different tasks may
317 /* 337 * belong to different cgroups on different hierarchies. In the other
318 * List running through cg_cgroup_links associated with a 338 * direction, a css_set is naturally associated with multiple cgroups.
319 * cgroup, anchored on cgroup->css_sets 339 * This M:N relationship is represented by the following link structure
320 */ 340 * which exists for each association and allows traversing the associations
321 struct list_head cgrp_link_list; 341 * from both sides.
322 struct cgroup *cgrp; 342 */
323 /* 343struct cgrp_cset_link {
324 * List running through cg_cgroup_links pointing at a 344 /* the cgroup and css_set this link associates */
325 * single css_set object, anchored on css_set->cg_links 345 struct cgroup *cgrp;
326 */ 346 struct css_set *cset;
327 struct list_head cg_link_list; 347
328 struct css_set *cg; 348 /* list of cgrp_cset_links anchored at cgrp->cset_links */
349 struct list_head cset_link;
350
351 /* list of cgrp_cset_links anchored at css_set->cgrp_links */
352 struct list_head cgrp_link;
329}; 353};
330 354
331/* The default css_set - used by init and its children prior to any 355/* The default css_set - used by init and its children prior to any
@@ -336,7 +360,7 @@ struct cg_cgroup_link {
336 */ 360 */
337 361
338static struct css_set init_css_set; 362static struct css_set init_css_set;
339static struct cg_cgroup_link init_css_set_link; 363static struct cgrp_cset_link init_cgrp_cset_link;
340 364
341static int cgroup_init_idr(struct cgroup_subsys *ss, 365static int cgroup_init_idr(struct cgroup_subsys *ss,
342 struct cgroup_subsys_state *css); 366 struct cgroup_subsys_state *css);
@@ -357,10 +381,11 @@ static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
357 381
358static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) 382static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
359{ 383{
360 int i;
361 unsigned long key = 0UL; 384 unsigned long key = 0UL;
385 struct cgroup_subsys *ss;
386 int i;
362 387
363 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) 388 for_each_subsys(ss, i)
364 key += (unsigned long)css[i]; 389 key += (unsigned long)css[i];
365 key = (key >> 16) ^ key; 390 key = (key >> 16) ^ key;
366 391
@@ -373,90 +398,83 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
373 * compiled into their kernel but not actually in use */ 398 * compiled into their kernel but not actually in use */
374static int use_task_css_set_links __read_mostly; 399static int use_task_css_set_links __read_mostly;
375 400
376static void __put_css_set(struct css_set *cg, int taskexit) 401static void __put_css_set(struct css_set *cset, int taskexit)
377{ 402{
378 struct cg_cgroup_link *link; 403 struct cgrp_cset_link *link, *tmp_link;
379 struct cg_cgroup_link *saved_link; 404
380 /* 405 /*
381 * Ensure that the refcount doesn't hit zero while any readers 406 * Ensure that the refcount doesn't hit zero while any readers
382 * can see it. Similar to atomic_dec_and_lock(), but for an 407 * can see it. Similar to atomic_dec_and_lock(), but for an
383 * rwlock 408 * rwlock
384 */ 409 */
385 if (atomic_add_unless(&cg->refcount, -1, 1)) 410 if (atomic_add_unless(&cset->refcount, -1, 1))
386 return; 411 return;
387 write_lock(&css_set_lock); 412 write_lock(&css_set_lock);
388 if (!atomic_dec_and_test(&cg->refcount)) { 413 if (!atomic_dec_and_test(&cset->refcount)) {
389 write_unlock(&css_set_lock); 414 write_unlock(&css_set_lock);
390 return; 415 return;
391 } 416 }
392 417
393 /* This css_set is dead. unlink it and release cgroup refcounts */ 418 /* This css_set is dead. unlink it and release cgroup refcounts */
394 hash_del(&cg->hlist); 419 hash_del(&cset->hlist);
395 css_set_count--; 420 css_set_count--;
396 421
397 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 422 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
398 cg_link_list) {
399 struct cgroup *cgrp = link->cgrp; 423 struct cgroup *cgrp = link->cgrp;
400 list_del(&link->cg_link_list);
401 list_del(&link->cgrp_link_list);
402 424
403 /* 425 list_del(&link->cset_link);
404 * We may not be holding cgroup_mutex, and if cgrp->count is 426 list_del(&link->cgrp_link);
405 * dropped to 0 the cgroup can be destroyed at any time, hence 427
406 * rcu_read_lock is used to keep it alive. 428 /* @cgrp can't go away while we're holding css_set_lock */
407 */ 429 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
408 rcu_read_lock();
409 if (atomic_dec_and_test(&cgrp->count) &&
410 notify_on_release(cgrp)) {
411 if (taskexit) 430 if (taskexit)
412 set_bit(CGRP_RELEASABLE, &cgrp->flags); 431 set_bit(CGRP_RELEASABLE, &cgrp->flags);
413 check_for_release(cgrp); 432 check_for_release(cgrp);
414 } 433 }
415 rcu_read_unlock();
416 434
417 kfree(link); 435 kfree(link);
418 } 436 }
419 437
420 write_unlock(&css_set_lock); 438 write_unlock(&css_set_lock);
421 kfree_rcu(cg, rcu_head); 439 kfree_rcu(cset, rcu_head);
422} 440}
423 441
424/* 442/*
425 * refcounted get/put for css_set objects 443 * refcounted get/put for css_set objects
426 */ 444 */
427static inline void get_css_set(struct css_set *cg) 445static inline void get_css_set(struct css_set *cset)
428{ 446{
429 atomic_inc(&cg->refcount); 447 atomic_inc(&cset->refcount);
430} 448}
431 449
432static inline void put_css_set(struct css_set *cg) 450static inline void put_css_set(struct css_set *cset)
433{ 451{
434 __put_css_set(cg, 0); 452 __put_css_set(cset, 0);
435} 453}
436 454
437static inline void put_css_set_taskexit(struct css_set *cg) 455static inline void put_css_set_taskexit(struct css_set *cset)
438{ 456{
439 __put_css_set(cg, 1); 457 __put_css_set(cset, 1);
440} 458}
441 459
442/* 460/**
443 * compare_css_sets - helper function for find_existing_css_set(). 461 * compare_css_sets - helper function for find_existing_css_set().
444 * @cg: candidate css_set being tested 462 * @cset: candidate css_set being tested
445 * @old_cg: existing css_set for a task 463 * @old_cset: existing css_set for a task
446 * @new_cgrp: cgroup that's being entered by the task 464 * @new_cgrp: cgroup that's being entered by the task
447 * @template: desired set of css pointers in css_set (pre-calculated) 465 * @template: desired set of css pointers in css_set (pre-calculated)
448 * 466 *
449 * Returns true if "cg" matches "old_cg" except for the hierarchy 467 * Returns true if "cg" matches "old_cg" except for the hierarchy
450 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 468 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
451 */ 469 */
452static bool compare_css_sets(struct css_set *cg, 470static bool compare_css_sets(struct css_set *cset,
453 struct css_set *old_cg, 471 struct css_set *old_cset,
454 struct cgroup *new_cgrp, 472 struct cgroup *new_cgrp,
455 struct cgroup_subsys_state *template[]) 473 struct cgroup_subsys_state *template[])
456{ 474{
457 struct list_head *l1, *l2; 475 struct list_head *l1, *l2;
458 476
459 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { 477 if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
460 /* Not all subsystems matched */ 478 /* Not all subsystems matched */
461 return false; 479 return false;
462 } 480 }
@@ -470,28 +488,28 @@ static bool compare_css_sets(struct css_set *cg,
470 * candidates. 488 * candidates.
471 */ 489 */
472 490
473 l1 = &cg->cg_links; 491 l1 = &cset->cgrp_links;
474 l2 = &old_cg->cg_links; 492 l2 = &old_cset->cgrp_links;
475 while (1) { 493 while (1) {
476 struct cg_cgroup_link *cgl1, *cgl2; 494 struct cgrp_cset_link *link1, *link2;
477 struct cgroup *cg1, *cg2; 495 struct cgroup *cgrp1, *cgrp2;
478 496
479 l1 = l1->next; 497 l1 = l1->next;
480 l2 = l2->next; 498 l2 = l2->next;
481 /* See if we reached the end - both lists are equal length. */ 499 /* See if we reached the end - both lists are equal length. */
482 if (l1 == &cg->cg_links) { 500 if (l1 == &cset->cgrp_links) {
483 BUG_ON(l2 != &old_cg->cg_links); 501 BUG_ON(l2 != &old_cset->cgrp_links);
484 break; 502 break;
485 } else { 503 } else {
486 BUG_ON(l2 == &old_cg->cg_links); 504 BUG_ON(l2 == &old_cset->cgrp_links);
487 } 505 }
488 /* Locate the cgroups associated with these links. */ 506 /* Locate the cgroups associated with these links. */
489 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); 507 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
490 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); 508 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
491 cg1 = cgl1->cgrp; 509 cgrp1 = link1->cgrp;
492 cg2 = cgl2->cgrp; 510 cgrp2 = link2->cgrp;
493 /* Hierarchies should be linked in the same order. */ 511 /* Hierarchies should be linked in the same order. */
494 BUG_ON(cg1->root != cg2->root); 512 BUG_ON(cgrp1->root != cgrp2->root);
495 513
496 /* 514 /*
497 * If this hierarchy is the hierarchy of the cgroup 515 * If this hierarchy is the hierarchy of the cgroup
@@ -500,46 +518,39 @@ static bool compare_css_sets(struct css_set *cg,
500 * hierarchy, then this css_set should point to the 518 * hierarchy, then this css_set should point to the
501 * same cgroup as the old css_set. 519 * same cgroup as the old css_set.
502 */ 520 */
503 if (cg1->root == new_cgrp->root) { 521 if (cgrp1->root == new_cgrp->root) {
504 if (cg1 != new_cgrp) 522 if (cgrp1 != new_cgrp)
505 return false; 523 return false;
506 } else { 524 } else {
507 if (cg1 != cg2) 525 if (cgrp1 != cgrp2)
508 return false; 526 return false;
509 } 527 }
510 } 528 }
511 return true; 529 return true;
512} 530}
513 531
514/* 532/**
515 * find_existing_css_set() is a helper for 533 * find_existing_css_set - init css array and find the matching css_set
516 * find_css_set(), and checks to see whether an existing 534 * @old_cset: the css_set that we're using before the cgroup transition
517 * css_set is suitable. 535 * @cgrp: the cgroup that we're moving into
518 * 536 * @template: out param for the new set of csses, should be clear on entry
519 * oldcg: the cgroup group that we're using before the cgroup
520 * transition
521 *
522 * cgrp: the cgroup that we're moving into
523 *
524 * template: location in which to build the desired set of subsystem
525 * state objects for the new cgroup group
526 */ 537 */
527static struct css_set *find_existing_css_set( 538static struct css_set *find_existing_css_set(struct css_set *old_cset,
528 struct css_set *oldcg, 539 struct cgroup *cgrp,
529 struct cgroup *cgrp, 540 struct cgroup_subsys_state *template[])
530 struct cgroup_subsys_state *template[])
531{ 541{
532 int i;
533 struct cgroupfs_root *root = cgrp->root; 542 struct cgroupfs_root *root = cgrp->root;
534 struct css_set *cg; 543 struct cgroup_subsys *ss;
544 struct css_set *cset;
535 unsigned long key; 545 unsigned long key;
546 int i;
536 547
537 /* 548 /*
538 * Build the set of subsystem state objects that we want to see in the 549 * Build the set of subsystem state objects that we want to see in the
539 * new css_set. while subsystems can change globally, the entries here 550 * new css_set. while subsystems can change globally, the entries here
540 * won't change, so no need for locking. 551 * won't change, so no need for locking.
541 */ 552 */
542 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 553 for_each_subsys(ss, i) {
543 if (root->subsys_mask & (1UL << i)) { 554 if (root->subsys_mask & (1UL << i)) {
544 /* Subsystem is in this hierarchy. So we want 555 /* Subsystem is in this hierarchy. So we want
545 * the subsystem state from the new 556 * the subsystem state from the new
@@ -548,148 +559,152 @@ static struct css_set *find_existing_css_set(
548 } else { 559 } else {
549 /* Subsystem is not in this hierarchy, so we 560 /* Subsystem is not in this hierarchy, so we
550 * don't want to change the subsystem state */ 561 * don't want to change the subsystem state */
551 template[i] = oldcg->subsys[i]; 562 template[i] = old_cset->subsys[i];
552 } 563 }
553 } 564 }
554 565
555 key = css_set_hash(template); 566 key = css_set_hash(template);
556 hash_for_each_possible(css_set_table, cg, hlist, key) { 567 hash_for_each_possible(css_set_table, cset, hlist, key) {
557 if (!compare_css_sets(cg, oldcg, cgrp, template)) 568 if (!compare_css_sets(cset, old_cset, cgrp, template))
558 continue; 569 continue;
559 570
560 /* This css_set matches what we need */ 571 /* This css_set matches what we need */
561 return cg; 572 return cset;
562 } 573 }
563 574
564 /* No existing cgroup group matched */ 575 /* No existing cgroup group matched */
565 return NULL; 576 return NULL;
566} 577}
567 578
568static void free_cg_links(struct list_head *tmp) 579static void free_cgrp_cset_links(struct list_head *links_to_free)
569{ 580{
570 struct cg_cgroup_link *link; 581 struct cgrp_cset_link *link, *tmp_link;
571 struct cg_cgroup_link *saved_link;
572 582
573 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { 583 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
574 list_del(&link->cgrp_link_list); 584 list_del(&link->cset_link);
575 kfree(link); 585 kfree(link);
576 } 586 }
577} 587}
578 588
579/* 589/**
580 * allocate_cg_links() allocates "count" cg_cgroup_link structures 590 * allocate_cgrp_cset_links - allocate cgrp_cset_links
581 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on 591 * @count: the number of links to allocate
582 * success or a negative error 592 * @tmp_links: list_head the allocated links are put on
593 *
594 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
595 * through ->cset_link. Returns 0 on success or -errno.
583 */ 596 */
584static int allocate_cg_links(int count, struct list_head *tmp) 597static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
585{ 598{
586 struct cg_cgroup_link *link; 599 struct cgrp_cset_link *link;
587 int i; 600 int i;
588 INIT_LIST_HEAD(tmp); 601
602 INIT_LIST_HEAD(tmp_links);
603
589 for (i = 0; i < count; i++) { 604 for (i = 0; i < count; i++) {
590 link = kmalloc(sizeof(*link), GFP_KERNEL); 605 link = kzalloc(sizeof(*link), GFP_KERNEL);
591 if (!link) { 606 if (!link) {
592 free_cg_links(tmp); 607 free_cgrp_cset_links(tmp_links);
593 return -ENOMEM; 608 return -ENOMEM;
594 } 609 }
595 list_add(&link->cgrp_link_list, tmp); 610 list_add(&link->cset_link, tmp_links);
596 } 611 }
597 return 0; 612 return 0;
598} 613}
599 614
600/** 615/**
601 * link_css_set - a helper function to link a css_set to a cgroup 616 * link_css_set - a helper function to link a css_set to a cgroup
602 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() 617 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
603 * @cg: the css_set to be linked 618 * @cset: the css_set to be linked
604 * @cgrp: the destination cgroup 619 * @cgrp: the destination cgroup
605 */ 620 */
606static void link_css_set(struct list_head *tmp_cg_links, 621static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
607 struct css_set *cg, struct cgroup *cgrp) 622 struct cgroup *cgrp)
608{ 623{
609 struct cg_cgroup_link *link; 624 struct cgrp_cset_link *link;
610 625
611 BUG_ON(list_empty(tmp_cg_links)); 626 BUG_ON(list_empty(tmp_links));
612 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, 627 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
613 cgrp_link_list); 628 link->cset = cset;
614 link->cg = cg;
615 link->cgrp = cgrp; 629 link->cgrp = cgrp;
616 atomic_inc(&cgrp->count); 630 list_move(&link->cset_link, &cgrp->cset_links);
617 list_move(&link->cgrp_link_list, &cgrp->css_sets);
618 /* 631 /*
619 * Always add links to the tail of the list so that the list 632 * Always add links to the tail of the list so that the list
620 * is sorted by order of hierarchy creation 633 * is sorted by order of hierarchy creation
621 */ 634 */
622 list_add_tail(&link->cg_link_list, &cg->cg_links); 635 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
623} 636}
624 637
625/* 638/**
626 * find_css_set() takes an existing cgroup group and a 639 * find_css_set - return a new css_set with one cgroup updated
627 * cgroup object, and returns a css_set object that's 640 * @old_cset: the baseline css_set
628 * equivalent to the old group, but with the given cgroup 641 * @cgrp: the cgroup to be updated
629 * substituted into the appropriate hierarchy. Must be called with 642 *
630 * cgroup_mutex held 643 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
644 * substituted into the appropriate hierarchy.
631 */ 645 */
632static struct css_set *find_css_set( 646static struct css_set *find_css_set(struct css_set *old_cset,
633 struct css_set *oldcg, struct cgroup *cgrp) 647 struct cgroup *cgrp)
634{ 648{
635 struct css_set *res; 649 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
636 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 650 struct css_set *cset;
637 651 struct list_head tmp_links;
638 struct list_head tmp_cg_links; 652 struct cgrp_cset_link *link;
639
640 struct cg_cgroup_link *link;
641 unsigned long key; 653 unsigned long key;
642 654
655 lockdep_assert_held(&cgroup_mutex);
656
643 /* First see if we already have a cgroup group that matches 657 /* First see if we already have a cgroup group that matches
644 * the desired set */ 658 * the desired set */
645 read_lock(&css_set_lock); 659 read_lock(&css_set_lock);
646 res = find_existing_css_set(oldcg, cgrp, template); 660 cset = find_existing_css_set(old_cset, cgrp, template);
647 if (res) 661 if (cset)
648 get_css_set(res); 662 get_css_set(cset);
649 read_unlock(&css_set_lock); 663 read_unlock(&css_set_lock);
650 664
651 if (res) 665 if (cset)
652 return res; 666 return cset;
653 667
654 res = kmalloc(sizeof(*res), GFP_KERNEL); 668 cset = kzalloc(sizeof(*cset), GFP_KERNEL);
655 if (!res) 669 if (!cset)
656 return NULL; 670 return NULL;
657 671
658 /* Allocate all the cg_cgroup_link objects that we'll need */ 672 /* Allocate all the cgrp_cset_link objects that we'll need */
659 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { 673 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
660 kfree(res); 674 kfree(cset);
661 return NULL; 675 return NULL;
662 } 676 }
663 677
664 atomic_set(&res->refcount, 1); 678 atomic_set(&cset->refcount, 1);
665 INIT_LIST_HEAD(&res->cg_links); 679 INIT_LIST_HEAD(&cset->cgrp_links);
666 INIT_LIST_HEAD(&res->tasks); 680 INIT_LIST_HEAD(&cset->tasks);
667 INIT_HLIST_NODE(&res->hlist); 681 INIT_HLIST_NODE(&cset->hlist);
668 682
669 /* Copy the set of subsystem state objects generated in 683 /* Copy the set of subsystem state objects generated in
670 * find_existing_css_set() */ 684 * find_existing_css_set() */
671 memcpy(res->subsys, template, sizeof(res->subsys)); 685 memcpy(cset->subsys, template, sizeof(cset->subsys));
672 686
673 write_lock(&css_set_lock); 687 write_lock(&css_set_lock);
674 /* Add reference counts and links from the new css_set. */ 688 /* Add reference counts and links from the new css_set. */
675 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { 689 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
676 struct cgroup *c = link->cgrp; 690 struct cgroup *c = link->cgrp;
691
677 if (c->root == cgrp->root) 692 if (c->root == cgrp->root)
678 c = cgrp; 693 c = cgrp;
679 link_css_set(&tmp_cg_links, res, c); 694 link_css_set(&tmp_links, cset, c);
680 } 695 }
681 696
682 BUG_ON(!list_empty(&tmp_cg_links)); 697 BUG_ON(!list_empty(&tmp_links));
683 698
684 css_set_count++; 699 css_set_count++;
685 700
686 /* Add this cgroup group to the hash table */ 701 /* Add this cgroup group to the hash table */
687 key = css_set_hash(res->subsys); 702 key = css_set_hash(cset->subsys);
688 hash_add(css_set_table, &res->hlist, key); 703 hash_add(css_set_table, &cset->hlist, key);
689 704
690 write_unlock(&css_set_lock); 705 write_unlock(&css_set_lock);
691 706
692 return res; 707 return cset;
693} 708}
694 709
695/* 710/*
@@ -699,7 +714,7 @@ static struct css_set *find_css_set(
699static struct cgroup *task_cgroup_from_root(struct task_struct *task, 714static struct cgroup *task_cgroup_from_root(struct task_struct *task,
700 struct cgroupfs_root *root) 715 struct cgroupfs_root *root)
701{ 716{
702 struct css_set *css; 717 struct css_set *cset;
703 struct cgroup *res = NULL; 718 struct cgroup *res = NULL;
704 719
705 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 720 BUG_ON(!mutex_is_locked(&cgroup_mutex));
@@ -709,13 +724,15 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
709 * task can't change groups, so the only thing that can happen 724 * task can't change groups, so the only thing that can happen
710 * is that it exits and its css is set back to init_css_set. 725 * is that it exits and its css is set back to init_css_set.
711 */ 726 */
712 css = task->cgroups; 727 cset = task_css_set(task);
713 if (css == &init_css_set) { 728 if (cset == &init_css_set) {
714 res = &root->top_cgroup; 729 res = &root->top_cgroup;
715 } else { 730 } else {
716 struct cg_cgroup_link *link; 731 struct cgrp_cset_link *link;
717 list_for_each_entry(link, &css->cg_links, cg_link_list) { 732
733 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
718 struct cgroup *c = link->cgrp; 734 struct cgroup *c = link->cgrp;
735
719 if (c->root == root) { 736 if (c->root == root) {
720 res = c; 737 res = c;
721 break; 738 break;
@@ -828,14 +845,14 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
828 845
829static void cgroup_free_fn(struct work_struct *work) 846static void cgroup_free_fn(struct work_struct *work)
830{ 847{
831 struct cgroup *cgrp = container_of(work, struct cgroup, free_work); 848 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
832 struct cgroup_subsys *ss; 849 struct cgroup_subsys *ss;
833 850
834 mutex_lock(&cgroup_mutex); 851 mutex_lock(&cgroup_mutex);
835 /* 852 /*
836 * Release the subsystem state objects. 853 * Release the subsystem state objects.
837 */ 854 */
838 for_each_subsys(cgrp->root, ss) 855 for_each_root_subsys(cgrp->root, ss)
839 ss->css_free(cgrp); 856 ss->css_free(cgrp);
840 857
841 cgrp->root->number_of_cgroups--; 858 cgrp->root->number_of_cgroups--;
@@ -873,7 +890,8 @@ static void cgroup_free_rcu(struct rcu_head *head)
873{ 890{
874 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 891 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
875 892
876 schedule_work(&cgrp->free_work); 893 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
894 schedule_work(&cgrp->destroy_work);
877} 895}
878 896
879static void cgroup_diput(struct dentry *dentry, struct inode *inode) 897static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -882,7 +900,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
882 if (S_ISDIR(inode->i_mode)) { 900 if (S_ISDIR(inode->i_mode)) {
883 struct cgroup *cgrp = dentry->d_fsdata; 901 struct cgroup *cgrp = dentry->d_fsdata;
884 902
885 BUG_ON(!(cgroup_is_removed(cgrp))); 903 BUG_ON(!(cgroup_is_dead(cgrp)));
886 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 904 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
887 } else { 905 } else {
888 struct cfent *cfe = __d_cfe(dentry); 906 struct cfent *cfe = __d_cfe(dentry);
@@ -950,7 +968,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
950 struct cgroup *cgrp = __d_cgrp(dir); 968 struct cgroup *cgrp = __d_cgrp(dir);
951 struct cgroup_subsys *ss; 969 struct cgroup_subsys *ss;
952 970
953 for_each_subsys(cgrp->root, ss) { 971 for_each_root_subsys(cgrp->root, ss) {
954 struct cftype_set *set; 972 struct cftype_set *set;
955 if (!test_bit(ss->subsys_id, &subsys_mask)) 973 if (!test_bit(ss->subsys_id, &subsys_mask))
956 continue; 974 continue;
@@ -988,30 +1006,23 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
988 * returns an error, no reference counts are touched. 1006 * returns an error, no reference counts are touched.
989 */ 1007 */
990static int rebind_subsystems(struct cgroupfs_root *root, 1008static int rebind_subsystems(struct cgroupfs_root *root,
991 unsigned long final_subsys_mask) 1009 unsigned long added_mask, unsigned removed_mask)
992{ 1010{
993 unsigned long added_mask, removed_mask;
994 struct cgroup *cgrp = &root->top_cgroup; 1011 struct cgroup *cgrp = &root->top_cgroup;
1012 struct cgroup_subsys *ss;
995 int i; 1013 int i;
996 1014
997 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1015 BUG_ON(!mutex_is_locked(&cgroup_mutex));
998 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 1016 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
999 1017
1000 removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
1001 added_mask = final_subsys_mask & ~root->actual_subsys_mask;
1002 /* Check that any added subsystems are currently free */ 1018 /* Check that any added subsystems are currently free */
1003 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1019 for_each_subsys(ss, i) {
1004 unsigned long bit = 1UL << i; 1020 unsigned long bit = 1UL << i;
1005 struct cgroup_subsys *ss = subsys[i]; 1021
1006 if (!(bit & added_mask)) 1022 if (!(bit & added_mask))
1007 continue; 1023 continue;
1008 /* 1024
1009 * Nobody should tell us to do a subsys that doesn't exist: 1025 if (ss->root != &cgroup_dummy_root) {
1010 * parse_cgroupfs_options should catch that case and refcounts
1011 * ensure that subsystems won't disappear once selected.
1012 */
1013 BUG_ON(ss == NULL);
1014 if (ss->root != &rootnode) {
1015 /* Subsystem isn't free */ 1026 /* Subsystem isn't free */
1016 return -EBUSY; 1027 return -EBUSY;
1017 } 1028 }
@@ -1025,38 +1036,41 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1025 return -EBUSY; 1036 return -EBUSY;
1026 1037
1027 /* Process each subsystem */ 1038 /* Process each subsystem */
1028 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1039 for_each_subsys(ss, i) {
1029 struct cgroup_subsys *ss = subsys[i];
1030 unsigned long bit = 1UL << i; 1040 unsigned long bit = 1UL << i;
1041
1031 if (bit & added_mask) { 1042 if (bit & added_mask) {
1032 /* We're binding this subsystem to this hierarchy */ 1043 /* We're binding this subsystem to this hierarchy */
1033 BUG_ON(ss == NULL);
1034 BUG_ON(cgrp->subsys[i]); 1044 BUG_ON(cgrp->subsys[i]);
1035 BUG_ON(!dummytop->subsys[i]); 1045 BUG_ON(!cgroup_dummy_top->subsys[i]);
1036 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 1046 BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);
1037 cgrp->subsys[i] = dummytop->subsys[i]; 1047
1048 cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
1038 cgrp->subsys[i]->cgroup = cgrp; 1049 cgrp->subsys[i]->cgroup = cgrp;
1039 list_move(&ss->sibling, &root->subsys_list); 1050 list_move(&ss->sibling, &root->subsys_list);
1040 ss->root = root; 1051 ss->root = root;
1041 if (ss->bind) 1052 if (ss->bind)
1042 ss->bind(cgrp); 1053 ss->bind(cgrp);
1054
1043 /* refcount was already taken, and we're keeping it */ 1055 /* refcount was already taken, and we're keeping it */
1056 root->subsys_mask |= bit;
1044 } else if (bit & removed_mask) { 1057 } else if (bit & removed_mask) {
1045 /* We're removing this subsystem */ 1058 /* We're removing this subsystem */
1046 BUG_ON(ss == NULL); 1059 BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
1047 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
1048 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1060 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
1061
1049 if (ss->bind) 1062 if (ss->bind)
1050 ss->bind(dummytop); 1063 ss->bind(cgroup_dummy_top);
1051 dummytop->subsys[i]->cgroup = dummytop; 1064 cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
1052 cgrp->subsys[i] = NULL; 1065 cgrp->subsys[i] = NULL;
1053 subsys[i]->root = &rootnode; 1066 cgroup_subsys[i]->root = &cgroup_dummy_root;
1054 list_move(&ss->sibling, &rootnode.subsys_list); 1067 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1068
1055 /* subsystem is now free - drop reference on module */ 1069 /* subsystem is now free - drop reference on module */
1056 module_put(ss->module); 1070 module_put(ss->module);
1057 } else if (bit & final_subsys_mask) { 1071 root->subsys_mask &= ~bit;
1072 } else if (bit & root->subsys_mask) {
1058 /* Subsystem state should already exist */ 1073 /* Subsystem state should already exist */
1059 BUG_ON(ss == NULL);
1060 BUG_ON(!cgrp->subsys[i]); 1074 BUG_ON(!cgrp->subsys[i]);
1061 /* 1075 /*
1062 * a refcount was taken, but we already had one, so 1076 * a refcount was taken, but we already had one, so
@@ -1071,7 +1085,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1071 BUG_ON(cgrp->subsys[i]); 1085 BUG_ON(cgrp->subsys[i]);
1072 } 1086 }
1073 } 1087 }
1074 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; 1088
1089 /*
1090 * Mark @root has finished binding subsystems. @root->subsys_mask
1091 * now matches the bound subsystems.
1092 */
1093 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1075 1094
1076 return 0; 1095 return 0;
1077} 1096}
@@ -1082,7 +1101,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1082 struct cgroup_subsys *ss; 1101 struct cgroup_subsys *ss;
1083 1102
1084 mutex_lock(&cgroup_root_mutex); 1103 mutex_lock(&cgroup_root_mutex);
1085 for_each_subsys(root, ss) 1104 for_each_root_subsys(root, ss)
1086 seq_printf(seq, ",%s", ss->name); 1105 seq_printf(seq, ",%s", ss->name);
1087 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1106 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1088 seq_puts(seq, ",sane_behavior"); 1107 seq_puts(seq, ",sane_behavior");
@@ -1114,18 +1133,19 @@ struct cgroup_sb_opts {
1114}; 1133};
1115 1134
1116/* 1135/*
1117 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call 1136 * Convert a hierarchy specifier into a bitmask of subsystems and
1118 * with cgroup_mutex held to protect the subsys[] array. This function takes 1137 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1119 * refcounts on subsystems to be used, unless it returns error, in which case 1138 * array. This function takes refcounts on subsystems to be used, unless it
1120 * no refcounts are taken. 1139 * returns error, in which case no refcounts are taken.
1121 */ 1140 */
1122static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1141static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1123{ 1142{
1124 char *token, *o = data; 1143 char *token, *o = data;
1125 bool all_ss = false, one_ss = false; 1144 bool all_ss = false, one_ss = false;
1126 unsigned long mask = (unsigned long)-1; 1145 unsigned long mask = (unsigned long)-1;
1127 int i;
1128 bool module_pin_failed = false; 1146 bool module_pin_failed = false;
1147 struct cgroup_subsys *ss;
1148 int i;
1129 1149
1130 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1150 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1131 1151
@@ -1202,10 +1222,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1202 continue; 1222 continue;
1203 } 1223 }
1204 1224
1205 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1225 for_each_subsys(ss, i) {
1206 struct cgroup_subsys *ss = subsys[i];
1207 if (ss == NULL)
1208 continue;
1209 if (strcmp(token, ss->name)) 1226 if (strcmp(token, ss->name))
1210 continue; 1227 continue;
1211 if (ss->disabled) 1228 if (ss->disabled)
@@ -1228,16 +1245,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1228 * otherwise if 'none', 'name=' and a subsystem name options 1245 * otherwise if 'none', 'name=' and a subsystem name options
1229 * were not specified, let's default to 'all' 1246 * were not specified, let's default to 'all'
1230 */ 1247 */
1231 if (all_ss || (!one_ss && !opts->none && !opts->name)) { 1248 if (all_ss || (!one_ss && !opts->none && !opts->name))
1232 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1249 for_each_subsys(ss, i)
1233 struct cgroup_subsys *ss = subsys[i]; 1250 if (!ss->disabled)
1234 if (ss == NULL) 1251 set_bit(i, &opts->subsys_mask);
1235 continue;
1236 if (ss->disabled)
1237 continue;
1238 set_bit(i, &opts->subsys_mask);
1239 }
1240 }
1241 1252
1242 /* Consistency checks */ 1253 /* Consistency checks */
1243 1254
@@ -1281,12 +1292,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1281 * take duplicate reference counts on a subsystem that's already used, 1292 * take duplicate reference counts on a subsystem that's already used,
1282 * but rebind_subsystems handles this case. 1293 * but rebind_subsystems handles this case.
1283 */ 1294 */
1284 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1295 for_each_subsys(ss, i) {
1285 unsigned long bit = 1UL << i; 1296 if (!(opts->subsys_mask & (1UL << i)))
1286
1287 if (!(bit & opts->subsys_mask))
1288 continue; 1297 continue;
1289 if (!try_module_get(subsys[i]->module)) { 1298 if (!try_module_get(cgroup_subsys[i]->module)) {
1290 module_pin_failed = true; 1299 module_pin_failed = true;
1291 break; 1300 break;
1292 } 1301 }
@@ -1303,7 +1312,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1303 1312
1304 if (!(bit & opts->subsys_mask)) 1313 if (!(bit & opts->subsys_mask))
1305 continue; 1314 continue;
1306 module_put(subsys[i]->module); 1315 module_put(cgroup_subsys[i]->module);
1307 } 1316 }
1308 return -ENOENT; 1317 return -ENOENT;
1309 } 1318 }
@@ -1313,14 +1322,14 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1313 1322
1314static void drop_parsed_module_refcounts(unsigned long subsys_mask) 1323static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1315{ 1324{
1325 struct cgroup_subsys *ss;
1316 int i; 1326 int i;
1317 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1318 unsigned long bit = 1UL << i;
1319 1327
1320 if (!(bit & subsys_mask)) 1328 mutex_lock(&cgroup_mutex);
1321 continue; 1329 for_each_subsys(ss, i)
1322 module_put(subsys[i]->module); 1330 if (subsys_mask & (1UL << i))
1323 } 1331 module_put(cgroup_subsys[i]->module);
1332 mutex_unlock(&cgroup_mutex);
1324} 1333}
1325 1334
1326static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1335static int cgroup_remount(struct super_block *sb, int *flags, char *data)
@@ -1345,7 +1354,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1345 if (ret) 1354 if (ret)
1346 goto out_unlock; 1355 goto out_unlock;
1347 1356
1348 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) 1357 if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1349 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1358 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1350 task_tgid_nr(current), current->comm); 1359 task_tgid_nr(current), current->comm);
1351 1360
@@ -1353,10 +1362,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1353 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1362 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1354 1363
1355 /* Don't allow flags or name to change at remount */ 1364 /* Don't allow flags or name to change at remount */
1356 if (opts.flags != root->flags || 1365 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1357 (opts.name && strcmp(opts.name, root->name))) { 1366 (opts.name && strcmp(opts.name, root->name))) {
1367 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
1368 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1369 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1358 ret = -EINVAL; 1370 ret = -EINVAL;
1359 drop_parsed_module_refcounts(opts.subsys_mask);
1360 goto out_unlock; 1371 goto out_unlock;
1361 } 1372 }
1362 1373
@@ -1367,11 +1378,10 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1367 */ 1378 */
1368 cgroup_clear_directory(cgrp->dentry, false, removed_mask); 1379 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1369 1380
1370 ret = rebind_subsystems(root, opts.subsys_mask); 1381 ret = rebind_subsystems(root, added_mask, removed_mask);
1371 if (ret) { 1382 if (ret) {
1372 /* rebind_subsystems failed, re-populate the removed files */ 1383 /* rebind_subsystems failed, re-populate the removed files */
1373 cgroup_populate_dir(cgrp, false, removed_mask); 1384 cgroup_populate_dir(cgrp, false, removed_mask);
1374 drop_parsed_module_refcounts(opts.subsys_mask);
1375 goto out_unlock; 1385 goto out_unlock;
1376 } 1386 }
1377 1387
@@ -1386,6 +1396,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1386 mutex_unlock(&cgroup_root_mutex); 1396 mutex_unlock(&cgroup_root_mutex);
1387 mutex_unlock(&cgroup_mutex); 1397 mutex_unlock(&cgroup_mutex);
1388 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1398 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1399 if (ret)
1400 drop_parsed_module_refcounts(opts.subsys_mask);
1389 return ret; 1401 return ret;
1390} 1402}
1391 1403
@@ -1401,11 +1413,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1401 INIT_LIST_HEAD(&cgrp->sibling); 1413 INIT_LIST_HEAD(&cgrp->sibling);
1402 INIT_LIST_HEAD(&cgrp->children); 1414 INIT_LIST_HEAD(&cgrp->children);
1403 INIT_LIST_HEAD(&cgrp->files); 1415 INIT_LIST_HEAD(&cgrp->files);
1404 INIT_LIST_HEAD(&cgrp->css_sets); 1416 INIT_LIST_HEAD(&cgrp->cset_links);
1405 INIT_LIST_HEAD(&cgrp->allcg_node);
1406 INIT_LIST_HEAD(&cgrp->release_list); 1417 INIT_LIST_HEAD(&cgrp->release_list);
1407 INIT_LIST_HEAD(&cgrp->pidlists); 1418 INIT_LIST_HEAD(&cgrp->pidlists);
1408 INIT_WORK(&cgrp->free_work, cgroup_free_fn);
1409 mutex_init(&cgrp->pidlist_mutex); 1419 mutex_init(&cgrp->pidlist_mutex);
1410 INIT_LIST_HEAD(&cgrp->event_list); 1420 INIT_LIST_HEAD(&cgrp->event_list);
1411 spin_lock_init(&cgrp->event_list_lock); 1421 spin_lock_init(&cgrp->event_list_lock);
@@ -1418,37 +1428,37 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1418 1428
1419 INIT_LIST_HEAD(&root->subsys_list); 1429 INIT_LIST_HEAD(&root->subsys_list);
1420 INIT_LIST_HEAD(&root->root_list); 1430 INIT_LIST_HEAD(&root->root_list);
1421 INIT_LIST_HEAD(&root->allcg_list);
1422 root->number_of_cgroups = 1; 1431 root->number_of_cgroups = 1;
1423 cgrp->root = root; 1432 cgrp->root = root;
1424 cgrp->name = &root_cgroup_name; 1433 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1425 init_cgroup_housekeeping(cgrp); 1434 init_cgroup_housekeeping(cgrp);
1426 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1427} 1435}
1428 1436
1429static bool init_root_id(struct cgroupfs_root *root) 1437static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
1430{ 1438{
1431 int ret = 0; 1439 int id;
1432 1440
1433 do { 1441 lockdep_assert_held(&cgroup_mutex);
1434 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) 1442 lockdep_assert_held(&cgroup_root_mutex);
1435 return false; 1443
1436 spin_lock(&hierarchy_id_lock); 1444 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
1437 /* Try to allocate the next unused ID */ 1445 GFP_KERNEL);
1438 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, 1446 if (id < 0)
1439 &root->hierarchy_id); 1447 return id;
1440 if (ret == -ENOSPC) 1448
1441 /* Try again starting from 0 */ 1449 root->hierarchy_id = id;
1442 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); 1450 return 0;
1443 if (!ret) { 1451}
1444 next_hierarchy_id = root->hierarchy_id + 1; 1452
1445 } else if (ret != -EAGAIN) { 1453static void cgroup_exit_root_id(struct cgroupfs_root *root)
1446 /* Can only get here if the 31-bit IDR is full ... */ 1454{
1447 BUG_ON(ret); 1455 lockdep_assert_held(&cgroup_mutex);
1448 } 1456 lockdep_assert_held(&cgroup_root_mutex);
1449 spin_unlock(&hierarchy_id_lock); 1457
1450 } while (ret); 1458 if (root->hierarchy_id) {
1451 return true; 1459 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1460 root->hierarchy_id = 0;
1461 }
1452} 1462}
1453 1463
1454static int cgroup_test_super(struct super_block *sb, void *data) 1464static int cgroup_test_super(struct super_block *sb, void *data)
@@ -1482,12 +1492,16 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1482 if (!root) 1492 if (!root)
1483 return ERR_PTR(-ENOMEM); 1493 return ERR_PTR(-ENOMEM);
1484 1494
1485 if (!init_root_id(root)) {
1486 kfree(root);
1487 return ERR_PTR(-ENOMEM);
1488 }
1489 init_cgroup_root(root); 1495 init_cgroup_root(root);
1490 1496
1497 /*
1498 * We need to set @root->subsys_mask now so that @root can be
1499 * matched by cgroup_test_super() before it finishes
1500 * initialization; otherwise, competing mounts with the same
1501 * options may try to bind the same subsystems instead of waiting
1502 * for the first one leading to unexpected mount errors.
1503 * SUBSYS_BOUND will be set once actual binding is complete.
1504 */
1491 root->subsys_mask = opts->subsys_mask; 1505 root->subsys_mask = opts->subsys_mask;
1492 root->flags = opts->flags; 1506 root->flags = opts->flags;
1493 ida_init(&root->cgroup_ida); 1507 ida_init(&root->cgroup_ida);
@@ -1500,17 +1514,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1500 return root; 1514 return root;
1501} 1515}
1502 1516
1503static void cgroup_drop_root(struct cgroupfs_root *root) 1517static void cgroup_free_root(struct cgroupfs_root *root)
1504{ 1518{
1505 if (!root) 1519 if (root) {
1506 return; 1520 /* hierarhcy ID shoulid already have been released */
1521 WARN_ON_ONCE(root->hierarchy_id);
1507 1522
1508 BUG_ON(!root->hierarchy_id); 1523 ida_destroy(&root->cgroup_ida);
1509 spin_lock(&hierarchy_id_lock); 1524 kfree(root);
1510 ida_remove(&hierarchy_ida, root->hierarchy_id); 1525 }
1511 spin_unlock(&hierarchy_id_lock);
1512 ida_destroy(&root->cgroup_ida);
1513 kfree(root);
1514} 1526}
1515 1527
1516static int cgroup_set_super(struct super_block *sb, void *data) 1528static int cgroup_set_super(struct super_block *sb, void *data)
@@ -1597,7 +1609,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1597 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); 1609 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
1598 if (IS_ERR(sb)) { 1610 if (IS_ERR(sb)) {
1599 ret = PTR_ERR(sb); 1611 ret = PTR_ERR(sb);
1600 cgroup_drop_root(opts.new_root); 1612 cgroup_free_root(opts.new_root);
1601 goto drop_modules; 1613 goto drop_modules;
1602 } 1614 }
1603 1615
@@ -1605,12 +1617,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1605 BUG_ON(!root); 1617 BUG_ON(!root);
1606 if (root == opts.new_root) { 1618 if (root == opts.new_root) {
1607 /* We used the new root structure, so this is a new hierarchy */ 1619 /* We used the new root structure, so this is a new hierarchy */
1608 struct list_head tmp_cg_links; 1620 struct list_head tmp_links;
1609 struct cgroup *root_cgrp = &root->top_cgroup; 1621 struct cgroup *root_cgrp = &root->top_cgroup;
1610 struct cgroupfs_root *existing_root; 1622 struct cgroupfs_root *existing_root;
1611 const struct cred *cred; 1623 const struct cred *cred;
1612 int i; 1624 int i;
1613 struct css_set *cg; 1625 struct css_set *cset;
1614 1626
1615 BUG_ON(sb->s_root != NULL); 1627 BUG_ON(sb->s_root != NULL);
1616 1628
@@ -1637,13 +1649,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1637 * that's us. The worst that can happen is that we 1649 * that's us. The worst that can happen is that we
1638 * have some link structures left over 1650 * have some link structures left over
1639 */ 1651 */
1640 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1652 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1641 if (ret) 1653 if (ret)
1642 goto unlock_drop; 1654 goto unlock_drop;
1643 1655
1644 ret = rebind_subsystems(root, root->subsys_mask); 1656 /* ID 0 is reserved for dummy root, 1 for unified hierarchy */
1657 ret = cgroup_init_root_id(root, 2, 0);
1658 if (ret)
1659 goto unlock_drop;
1660
1661 ret = rebind_subsystems(root, root->subsys_mask, 0);
1645 if (ret == -EBUSY) { 1662 if (ret == -EBUSY) {
1646 free_cg_links(&tmp_cg_links); 1663 free_cgrp_cset_links(&tmp_links);
1647 goto unlock_drop; 1664 goto unlock_drop;
1648 } 1665 }
1649 /* 1666 /*
@@ -1655,8 +1672,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1655 /* EBUSY should be the only error here */ 1672 /* EBUSY should be the only error here */
1656 BUG_ON(ret); 1673 BUG_ON(ret);
1657 1674
1658 list_add(&root->root_list, &roots); 1675 list_add(&root->root_list, &cgroup_roots);
1659 root_count++; 1676 cgroup_root_count++;
1660 1677
1661 sb->s_root->d_fsdata = root_cgrp; 1678 sb->s_root->d_fsdata = root_cgrp;
1662 root->top_cgroup.dentry = sb->s_root; 1679 root->top_cgroup.dentry = sb->s_root;
@@ -1664,11 +1681,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1664 /* Link the top cgroup in this hierarchy into all 1681 /* Link the top cgroup in this hierarchy into all
1665 * the css_set objects */ 1682 * the css_set objects */
1666 write_lock(&css_set_lock); 1683 write_lock(&css_set_lock);
1667 hash_for_each(css_set_table, i, cg, hlist) 1684 hash_for_each(css_set_table, i, cset, hlist)
1668 link_css_set(&tmp_cg_links, cg, root_cgrp); 1685 link_css_set(&tmp_links, cset, root_cgrp);
1669 write_unlock(&css_set_lock); 1686 write_unlock(&css_set_lock);
1670 1687
1671 free_cg_links(&tmp_cg_links); 1688 free_cgrp_cset_links(&tmp_links);
1672 1689
1673 BUG_ON(!list_empty(&root_cgrp->children)); 1690 BUG_ON(!list_empty(&root_cgrp->children));
1674 BUG_ON(root->number_of_cgroups != 1); 1691 BUG_ON(root->number_of_cgroups != 1);
@@ -1684,9 +1701,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1684 * We re-used an existing hierarchy - the new root (if 1701 * We re-used an existing hierarchy - the new root (if
1685 * any) is not needed 1702 * any) is not needed
1686 */ 1703 */
1687 cgroup_drop_root(opts.new_root); 1704 cgroup_free_root(opts.new_root);
1688 1705
1689 if (root->flags != opts.flags) { 1706 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1690 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1707 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1691 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1708 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1692 ret = -EINVAL; 1709 ret = -EINVAL;
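The remount check in the hunk above is now a mask-filtered comparison: XOR exposes the bits on which the existing root and the requested options differ, and CGRP_ROOT_OPTION_MASK throws away internal bookkeeping bits (such as the SUBSYS_BOUND state this series introduces) so they can never cause a spurious mismatch. A stand-alone illustration of the idiom, with invented flag values:

#include <stdio.h>

#define OPT_NOPREFIX	0x1	/* user-visible option bits (made up) */
#define OPT_XATTR	0x2
#define OPT_MASK	(OPT_NOPREFIX | OPT_XATTR)
#define INTERNAL_BOUND	0x4	/* internal state, excluded from the mask */

int main(void)
{
	unsigned long cur = OPT_XATTR | INTERNAL_BOUND;	/* mounted root */
	unsigned long req = OPT_XATTR;			/* new mount request */

	/* They differ only in INTERNAL_BOUND, which the mask filters out. */
	printf("option mismatch: %s\n", ((cur ^ req) & OPT_MASK) ? "yes" : "no");
	return 0;
}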
@@ -1705,6 +1722,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1705 return dget(sb->s_root); 1722 return dget(sb->s_root);
1706 1723
1707 unlock_drop: 1724 unlock_drop:
1725 cgroup_exit_root_id(root);
1708 mutex_unlock(&cgroup_root_mutex); 1726 mutex_unlock(&cgroup_root_mutex);
1709 mutex_unlock(&cgroup_mutex); 1727 mutex_unlock(&cgroup_mutex);
1710 mutex_unlock(&inode->i_mutex); 1728 mutex_unlock(&inode->i_mutex);
@@ -1721,9 +1739,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1721static void cgroup_kill_sb(struct super_block *sb) { 1739static void cgroup_kill_sb(struct super_block *sb) {
1722 struct cgroupfs_root *root = sb->s_fs_info; 1740 struct cgroupfs_root *root = sb->s_fs_info;
1723 struct cgroup *cgrp = &root->top_cgroup; 1741 struct cgroup *cgrp = &root->top_cgroup;
1742 struct cgrp_cset_link *link, *tmp_link;
1724 int ret; 1743 int ret;
1725 struct cg_cgroup_link *link;
1726 struct cg_cgroup_link *saved_link;
1727 1744
1728 BUG_ON(!root); 1745 BUG_ON(!root);
1729 1746
@@ -1734,36 +1751,39 @@ static void cgroup_kill_sb(struct super_block *sb) {
1734 mutex_lock(&cgroup_root_mutex); 1751 mutex_lock(&cgroup_root_mutex);
1735 1752
1736 /* Rebind all subsystems back to the default hierarchy */ 1753 /* Rebind all subsystems back to the default hierarchy */
1737 ret = rebind_subsystems(root, 0); 1754 if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
1738 /* Shouldn't be able to fail ... */ 1755 ret = rebind_subsystems(root, 0, root->subsys_mask);
1739 BUG_ON(ret); 1756 /* Shouldn't be able to fail ... */
1757 BUG_ON(ret);
1758 }
1740 1759
1741 /* 1760 /*
1742 * Release all the links from css_sets to this hierarchy's 1761 * Release all the links from cset_links to this hierarchy's
1743 * root cgroup 1762 * root cgroup
1744 */ 1763 */
1745 write_lock(&css_set_lock); 1764 write_lock(&css_set_lock);
1746 1765
1747 list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, 1766 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1748 cgrp_link_list) { 1767 list_del(&link->cset_link);
1749 list_del(&link->cg_link_list); 1768 list_del(&link->cgrp_link);
1750 list_del(&link->cgrp_link_list);
1751 kfree(link); 1769 kfree(link);
1752 } 1770 }
1753 write_unlock(&css_set_lock); 1771 write_unlock(&css_set_lock);
1754 1772
1755 if (!list_empty(&root->root_list)) { 1773 if (!list_empty(&root->root_list)) {
1756 list_del(&root->root_list); 1774 list_del(&root->root_list);
1757 root_count--; 1775 cgroup_root_count--;
1758 } 1776 }
1759 1777
1778 cgroup_exit_root_id(root);
1779
1760 mutex_unlock(&cgroup_root_mutex); 1780 mutex_unlock(&cgroup_root_mutex);
1761 mutex_unlock(&cgroup_mutex); 1781 mutex_unlock(&cgroup_mutex);
1762 1782
1763 simple_xattrs_free(&cgrp->xattrs); 1783 simple_xattrs_free(&cgrp->xattrs);
1764 1784
1765 kill_litter_super(sb); 1785 kill_litter_super(sb);
1766 cgroup_drop_root(root); 1786 cgroup_free_root(root);
1767} 1787}
1768 1788
1769static struct file_system_type cgroup_fs_type = { 1789static struct file_system_type cgroup_fs_type = {
@@ -1825,6 +1845,38 @@ out:
1825} 1845}
1826EXPORT_SYMBOL_GPL(cgroup_path); 1846EXPORT_SYMBOL_GPL(cgroup_path);
1827 1847
1848/**
1849 * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy
1850 * @task: target task
1851 * @hierarchy_id: the hierarchy to look up @task's cgroup from
1852 * @buf: the buffer to write the path into
1853 * @buflen: the length of the buffer
1854 *
1855 * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and
1856 * copy its path into @buf. This function grabs cgroup_mutex and shouldn't
1857 * be used inside locks used by cgroup controller callbacks.
1858 */
1859int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id,
1860 char *buf, size_t buflen)
1861{
1862 struct cgroupfs_root *root;
1863 struct cgroup *cgrp = NULL;
1864 int ret = -ENOENT;
1865
1866 mutex_lock(&cgroup_mutex);
1867
1868 root = idr_find(&cgroup_hierarchy_idr, hierarchy_id);
1869 if (root) {
1870 cgrp = task_cgroup_from_root(task, root);
1871 ret = cgroup_path(cgrp, buf, buflen);
1872 }
1873
1874 mutex_unlock(&cgroup_mutex);
1875
1876 return ret;
1877}
1878EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy);
1879
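A hypothetical caller of the new helper might look like the sketch below; it is not part of this patch. As the kernel-doc says, it has to run in a context where taking cgroup_mutex is safe, and it sees -ENOENT when no hierarchy carries the requested ID (cgroup_path() can also fail if the buffer is too short).

/* Illustrative only -- an assumed caller, not code from this series. */
static int report_task_cgroup(struct task_struct *tsk, int hierarchy_id)
{
	char *buf;
	int err;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	err = task_cgroup_path_from_hierarchy(tsk, hierarchy_id, buf, PATH_MAX);
	if (!err)
		pr_info("pid %d: %s\n", task_pid_nr(tsk), buf);

	kfree(buf);
	return err;
}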
1828/* 1880/*
1829 * Control Group taskset 1881 * Control Group taskset
1830 */ 1882 */
@@ -1910,10 +1962,11 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1910 * 1962 *
1911 * Must be called with cgroup_mutex and threadgroup locked. 1963 * Must be called with cgroup_mutex and threadgroup locked.
1912 */ 1964 */
1913static void cgroup_task_migrate(struct cgroup *oldcgrp, 1965static void cgroup_task_migrate(struct cgroup *old_cgrp,
1914 struct task_struct *tsk, struct css_set *newcg) 1966 struct task_struct *tsk,
1967 struct css_set *new_cset)
1915{ 1968{
1916 struct css_set *oldcg; 1969 struct css_set *old_cset;
1917 1970
1918 /* 1971 /*
1919 * We are synchronized through threadgroup_lock() against PF_EXITING 1972 * We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1921,25 +1974,25 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp,
1921 * css_set to init_css_set and dropping the old one. 1974 * css_set to init_css_set and dropping the old one.
1922 */ 1975 */
1923 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1976 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1924 oldcg = tsk->cgroups; 1977 old_cset = task_css_set(tsk);
1925 1978
1926 task_lock(tsk); 1979 task_lock(tsk);
1927 rcu_assign_pointer(tsk->cgroups, newcg); 1980 rcu_assign_pointer(tsk->cgroups, new_cset);
1928 task_unlock(tsk); 1981 task_unlock(tsk);
1929 1982
1930 /* Update the css_set linked lists if we're using them */ 1983 /* Update the css_set linked lists if we're using them */
1931 write_lock(&css_set_lock); 1984 write_lock(&css_set_lock);
1932 if (!list_empty(&tsk->cg_list)) 1985 if (!list_empty(&tsk->cg_list))
1933 list_move(&tsk->cg_list, &newcg->tasks); 1986 list_move(&tsk->cg_list, &new_cset->tasks);
1934 write_unlock(&css_set_lock); 1987 write_unlock(&css_set_lock);
1935 1988
1936 /* 1989 /*
1937 * We just gained a reference on oldcg by taking it from the task. As 1990 * We just gained a reference on old_cset by taking it from the
1938 * trading it for newcg is protected by cgroup_mutex, we're safe to drop 1991 * task. As trading it for new_cset is protected by cgroup_mutex,
1939 * it here; it will be freed under RCU. 1992 * we're safe to drop it here; it will be freed under RCU.
1940 */ 1993 */
1941 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1994 set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1942 put_css_set(oldcg); 1995 put_css_set(old_cset);
1943} 1996}
1944 1997
1945/** 1998/**
@@ -2029,7 +2082,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2029 /* 2082 /*
2030 * step 1: check that we can legitimately attach to the cgroup. 2083 * step 1: check that we can legitimately attach to the cgroup.
2031 */ 2084 */
2032 for_each_subsys(root, ss) { 2085 for_each_root_subsys(root, ss) {
2033 if (ss->can_attach) { 2086 if (ss->can_attach) {
2034 retval = ss->can_attach(cgrp, &tset); 2087 retval = ss->can_attach(cgrp, &tset);
2035 if (retval) { 2088 if (retval) {
@@ -2044,8 +2097,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2044 * we use find_css_set, which allocates a new one if necessary. 2097 * we use find_css_set, which allocates a new one if necessary.
2045 */ 2098 */
2046 for (i = 0; i < group_size; i++) { 2099 for (i = 0; i < group_size; i++) {
2100 struct css_set *old_cset;
2101
2047 tc = flex_array_get(group, i); 2102 tc = flex_array_get(group, i);
2048 tc->cg = find_css_set(tc->task->cgroups, cgrp); 2103 old_cset = task_css_set(tc->task);
2104 tc->cg = find_css_set(old_cset, cgrp);
2049 if (!tc->cg) { 2105 if (!tc->cg) {
2050 retval = -ENOMEM; 2106 retval = -ENOMEM;
2051 goto out_put_css_set_refs; 2107 goto out_put_css_set_refs;
@@ -2066,7 +2122,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2066 /* 2122 /*
2067 * step 4: do subsystem attach callbacks. 2123 * step 4: do subsystem attach callbacks.
2068 */ 2124 */
2069 for_each_subsys(root, ss) { 2125 for_each_root_subsys(root, ss) {
2070 if (ss->attach) 2126 if (ss->attach)
2071 ss->attach(cgrp, &tset); 2127 ss->attach(cgrp, &tset);
2072 } 2128 }
@@ -2086,7 +2142,7 @@ out_put_css_set_refs:
2086 } 2142 }
2087out_cancel_attach: 2143out_cancel_attach:
2088 if (retval) { 2144 if (retval) {
2089 for_each_subsys(root, ss) { 2145 for_each_root_subsys(root, ss) {
2090 if (ss == failed_ss) 2146 if (ss == failed_ss)
2091 break; 2147 break;
2092 if (ss->cancel_attach) 2148 if (ss->cancel_attach)
@@ -2323,7 +2379,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2323 struct cftype *cft = __d_cft(file->f_dentry); 2379 struct cftype *cft = __d_cft(file->f_dentry);
2324 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2380 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2325 2381
2326 if (cgroup_is_removed(cgrp)) 2382 if (cgroup_is_dead(cgrp))
2327 return -ENODEV; 2383 return -ENODEV;
2328 if (cft->write) 2384 if (cft->write)
2329 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 2385 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -2368,7 +2424,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2368 struct cftype *cft = __d_cft(file->f_dentry); 2424 struct cftype *cft = __d_cft(file->f_dentry);
2369 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2425 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2370 2426
2371 if (cgroup_is_removed(cgrp)) 2427 if (cgroup_is_dead(cgrp))
2372 return -ENODEV; 2428 return -ENODEV;
2373 2429
2374 if (cft->read) 2430 if (cft->read)
@@ -2435,10 +2491,12 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2435 cft = __d_cft(file->f_dentry); 2491 cft = __d_cft(file->f_dentry);
2436 2492
2437 if (cft->read_map || cft->read_seq_string) { 2493 if (cft->read_map || cft->read_seq_string) {
2438 struct cgroup_seqfile_state *state = 2494 struct cgroup_seqfile_state *state;
2439 kzalloc(sizeof(*state), GFP_USER); 2495
2496 state = kzalloc(sizeof(*state), GFP_USER);
2440 if (!state) 2497 if (!state)
2441 return -ENOMEM; 2498 return -ENOMEM;
2499
2442 state->cft = cft; 2500 state->cft = cft;
2443 state->cgroup = __d_cgrp(file->f_dentry->d_parent); 2501 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2444 file->f_op = &cgroup_seqfile_operations; 2502 file->f_op = &cgroup_seqfile_operations;
@@ -2486,6 +2544,13 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2486 2544
2487 cgrp = __d_cgrp(old_dentry); 2545 cgrp = __d_cgrp(old_dentry);
2488 2546
2547 /*
2548 * This isn't a proper migration and its usefulness is very
2549 * limited. Disallow if sane_behavior.
2550 */
2551 if (cgroup_sane_behavior(cgrp))
2552 return -EPERM;
2553
2489 name = cgroup_alloc_name(new_dentry); 2554 name = cgroup_alloc_name(new_dentry);
2490 if (!name) 2555 if (!name)
2491 return -ENOMEM; 2556 return -ENOMEM;
@@ -2496,7 +2561,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2496 return ret; 2561 return ret;
2497 } 2562 }
2498 2563
2499 old_name = cgrp->name; 2564 old_name = rcu_dereference_protected(cgrp->name, true);
2500 rcu_assign_pointer(cgrp->name, name); 2565 rcu_assign_pointer(cgrp->name, name);
2501 2566
2502 kfree_rcu(old_name, rcu_head); 2567 kfree_rcu(old_name, rcu_head);
@@ -2747,58 +2812,78 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2747 return ret; 2812 return ret;
2748} 2813}
2749 2814
2750static DEFINE_MUTEX(cgroup_cft_mutex);
2751
2752static void cgroup_cfts_prepare(void) 2815static void cgroup_cfts_prepare(void)
2753 __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) 2816 __acquires(&cgroup_mutex)
2754{ 2817{
2755 /* 2818 /*
2756 * Thanks to the entanglement with vfs inode locking, we can't walk 2819 * Thanks to the entanglement with vfs inode locking, we can't walk
2757 * the existing cgroups under cgroup_mutex and create files. 2820 * the existing cgroups under cgroup_mutex and create files.
2758 * Instead, we increment reference on all cgroups and build list of 2821 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU
2759 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure 2822 * read lock before calling cgroup_addrm_files().
2760 * exclusive access to the field.
2761 */ 2823 */
2762 mutex_lock(&cgroup_cft_mutex);
2763 mutex_lock(&cgroup_mutex); 2824 mutex_lock(&cgroup_mutex);
2764} 2825}
2765 2826
2766static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2827static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2767 struct cftype *cfts, bool is_add) 2828 struct cftype *cfts, bool is_add)
2768 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) 2829 __releases(&cgroup_mutex)
2769{ 2830{
2770 LIST_HEAD(pending); 2831 LIST_HEAD(pending);
2771 struct cgroup *cgrp, *n; 2832 struct cgroup *cgrp, *root = &ss->root->top_cgroup;
2833 struct super_block *sb = ss->root->sb;
2834 struct dentry *prev = NULL;
2835 struct inode *inode;
2836 u64 update_before;
2772 2837
2773 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2838 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2774 if (cfts && ss->root != &rootnode) { 2839 if (!cfts || ss->root == &cgroup_dummy_root ||
2775 list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { 2840 !atomic_inc_not_zero(&sb->s_active)) {
2776 dget(cgrp->dentry); 2841 mutex_unlock(&cgroup_mutex);
2777 list_add_tail(&cgrp->cft_q_node, &pending); 2842 return;
2778 }
2779 } 2843 }
2780 2844
2781 mutex_unlock(&cgroup_mutex);
2782
2783 /* 2845 /*
2784 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm 2846 * All cgroups which are created after we drop cgroup_mutex will
2785 * files for all cgroups which were created before. 2847 * have the updated set of files, so we only need to update the
2848 * cgroups created before the current @cgroup_serial_nr_next.
2786 */ 2849 */
2787 list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { 2850 update_before = cgroup_serial_nr_next;
2788 struct inode *inode = cgrp->dentry->d_inode; 2851
2852 mutex_unlock(&cgroup_mutex);
2853
2854 /* @root always needs to be updated */
2855 inode = root->dentry->d_inode;
2856 mutex_lock(&inode->i_mutex);
2857 mutex_lock(&cgroup_mutex);
2858 cgroup_addrm_files(root, ss, cfts, is_add);
2859 mutex_unlock(&cgroup_mutex);
2860 mutex_unlock(&inode->i_mutex);
2861
2862 /* add/rm files for all cgroups created before */
2863 rcu_read_lock();
2864 cgroup_for_each_descendant_pre(cgrp, root) {
2865 if (cgroup_is_dead(cgrp))
2866 continue;
2867
2868 inode = cgrp->dentry->d_inode;
2869 dget(cgrp->dentry);
2870 rcu_read_unlock();
2871
2872 dput(prev);
2873 prev = cgrp->dentry;
2789 2874
2790 mutex_lock(&inode->i_mutex); 2875 mutex_lock(&inode->i_mutex);
2791 mutex_lock(&cgroup_mutex); 2876 mutex_lock(&cgroup_mutex);
2792 if (!cgroup_is_removed(cgrp)) 2877 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2793 cgroup_addrm_files(cgrp, ss, cfts, is_add); 2878 cgroup_addrm_files(cgrp, ss, cfts, is_add);
2794 mutex_unlock(&cgroup_mutex); 2879 mutex_unlock(&cgroup_mutex);
2795 mutex_unlock(&inode->i_mutex); 2880 mutex_unlock(&inode->i_mutex);
2796 2881
2797 list_del_init(&cgrp->cft_q_node); 2882 rcu_read_lock();
2798 dput(cgrp->dentry);
2799 } 2883 }
2800 2884 rcu_read_unlock();
2801 mutex_unlock(&cgroup_cft_mutex); 2885 dput(prev);
2886 deactivate_super(sb);
2802} 2887}
2803 2888
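The key trick in the rewritten cgroup_cfts_commit() is the update_before snapshot: cgroup_serial_nr_next only ever grows and cgroups created after the change pick up the new cftypes at creation time, so the descendant walk only has to touch cgroups whose serial number is below the snapshot. A compressed user-space model of that snapshot-and-filter step (no locking; names invented):

#include <stdio.h>

static unsigned long long serial_nr_next;	/* only ever increases */

struct item { unsigned long long serial; int has_new_files; };

static void create(struct item *it, int with_new_files)
{
	it->serial = serial_nr_next++;
	it->has_new_files = with_new_files;
}

int main(void)
{
	struct item items[4];
	int i;

	/* Two items exist before the file set changes. */
	for (i = 0; i < 2; i++)
		create(&items[i], 0);

	/* Changing the file set: snapshot the next serial number first. */
	unsigned long long update_before = serial_nr_next;

	/* Items created afterwards are born with the new files already. */
	for (i = 2; i < 4; i++)
		create(&items[i], 1);

	/* The walk only needs to fix up items older than the snapshot. */
	for (i = 0; i < 4; i++)
		if (items[i].serial < update_before)
			items[i].has_new_files = 1;

	for (i = 0; i < 4; i++)
		printf("item %d has_new_files=%d\n", i, items[i].has_new_files);
	return 0;
}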
2804/** 2889/**
@@ -2853,7 +2938,8 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2853 2938
2854 list_for_each_entry(set, &ss->cftsets, node) { 2939 list_for_each_entry(set, &ss->cftsets, node) {
2855 if (set->cfts == cfts) { 2940 if (set->cfts == cfts) {
2856 list_del_init(&set->node); 2941 list_del(&set->node);
2942 kfree(set);
2857 cgroup_cfts_commit(ss, cfts, false); 2943 cgroup_cfts_commit(ss, cfts, false);
2858 return 0; 2944 return 0;
2859 } 2945 }
@@ -2872,12 +2958,11 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2872int cgroup_task_count(const struct cgroup *cgrp) 2958int cgroup_task_count(const struct cgroup *cgrp)
2873{ 2959{
2874 int count = 0; 2960 int count = 0;
2875 struct cg_cgroup_link *link; 2961 struct cgrp_cset_link *link;
2876 2962
2877 read_lock(&css_set_lock); 2963 read_lock(&css_set_lock);
2878 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { 2964 list_for_each_entry(link, &cgrp->cset_links, cset_link)
2879 count += atomic_read(&link->cg->refcount); 2965 count += atomic_read(&link->cset->refcount);
2880 }
2881 read_unlock(&css_set_lock); 2966 read_unlock(&css_set_lock);
2882 return count; 2967 return count;
2883} 2968}
@@ -2886,25 +2971,24 @@ int cgroup_task_count(const struct cgroup *cgrp)
2886 * Advance a list_head iterator. The iterator should be positioned at 2971 * Advance a list_head iterator. The iterator should be positioned at
2887 * the start of a css_set 2972 * the start of a css_set
2888 */ 2973 */
2889static void cgroup_advance_iter(struct cgroup *cgrp, 2974static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it)
2890 struct cgroup_iter *it)
2891{ 2975{
2892 struct list_head *l = it->cg_link; 2976 struct list_head *l = it->cset_link;
2893 struct cg_cgroup_link *link; 2977 struct cgrp_cset_link *link;
2894 struct css_set *cg; 2978 struct css_set *cset;
2895 2979
2896 /* Advance to the next non-empty css_set */ 2980 /* Advance to the next non-empty css_set */
2897 do { 2981 do {
2898 l = l->next; 2982 l = l->next;
2899 if (l == &cgrp->css_sets) { 2983 if (l == &cgrp->cset_links) {
2900 it->cg_link = NULL; 2984 it->cset_link = NULL;
2901 return; 2985 return;
2902 } 2986 }
2903 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); 2987 link = list_entry(l, struct cgrp_cset_link, cset_link);
2904 cg = link->cg; 2988 cset = link->cset;
2905 } while (list_empty(&cg->tasks)); 2989 } while (list_empty(&cset->tasks));
2906 it->cg_link = l; 2990 it->cset_link = l;
2907 it->task = cg->tasks.next; 2991 it->task = cset->tasks.next;
2908} 2992}
2909 2993
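cgroup_advance_iter() above walks a two-level structure: a list of cgrp_cset_link records, each naming a css_set whose task list may be empty and must be skipped. The user-space sketch below models just that "advance past empty inner lists" step with plain arrays; the names and types are invented for the example.

#include <stdio.h>

struct cset { const int *tasks; int ntasks; };	/* stand-in for a css_set */
struct iter { int set_idx; int task_idx; };

/* Position the iterator on the next non-empty set, like cgroup_advance_iter(). */
static void advance(struct iter *it, const struct cset *sets, int nsets)
{
	while (it->set_idx < nsets && sets[it->set_idx].ntasks == 0)
		it->set_idx++;
	it->task_idx = 0;
}

/* Return the next task ID, or -1 when the walk is finished. */
static int next_task(struct iter *it, const struct cset *sets, int nsets)
{
	int id;

	if (it->set_idx >= nsets)
		return -1;
	id = sets[it->set_idx].tasks[it->task_idx++];
	if (it->task_idx >= sets[it->set_idx].ntasks) {
		it->set_idx++;
		advance(it, sets, nsets);
	}
	return id;
}

int main(void)
{
	static const int a[] = { 1, 2 }, c[] = { 7 };
	const struct cset sets[] = { { a, 2 }, { NULL, 0 }, { c, 1 } };
	struct iter it = { 0, 0 };
	int id;

	advance(&it, sets, 3);
	while ((id = next_task(&it, sets, 3)) != -1)
		printf("task %d\n", id);	/* prints 1, 2 and 7 */
	return 0;
}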
2910/* 2994/*
@@ -2934,7 +3018,7 @@ static void cgroup_enable_task_cg_lists(void)
2934 * entry won't be deleted though the process has exited. 3018 * entry won't be deleted though the process has exited.
2935 */ 3019 */
2936 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) 3020 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2937 list_add(&p->cg_list, &p->cgroups->tasks); 3021 list_add(&p->cg_list, &task_css_set(p)->tasks);
2938 task_unlock(p); 3022 task_unlock(p);
2939 } while_each_thread(g, p); 3023 } while_each_thread(g, p);
2940 read_unlock(&tasklist_lock); 3024 read_unlock(&tasklist_lock);
@@ -2942,12 +3026,67 @@ static void cgroup_enable_task_cg_lists(void)
2942} 3026}
2943 3027
2944/** 3028/**
3029 * cgroup_next_sibling - find the next sibling of a given cgroup
3030 * @pos: the current cgroup
3031 *
3032 * This function returns the next sibling of @pos and should be called
3033 * under RCU read lock. The only requirement is that @pos is accessible.
3034 * The next sibling is guaranteed to be returned regardless of @pos's
3035 * state.
3036 */
3037struct cgroup *cgroup_next_sibling(struct cgroup *pos)
3038{
3039 struct cgroup *next;
3040
3041 WARN_ON_ONCE(!rcu_read_lock_held());
3042
3043 /*
3044 * @pos could already have been removed. Once a cgroup is removed,
3045 * its ->sibling.next is no longer updated when its next sibling
3046 * changes. As CGRP_DEAD assertion is serialized and happens
3047 * before the cgroup is taken off the ->sibling list, if we see it
3048 * unasserted, it's guaranteed that the next sibling hasn't
3049 * finished its grace period even if it's already removed, and thus
3050 * safe to dereference from this RCU critical section. If
3051 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3052 * to be visible as %true here.
3053 */
3054 if (likely(!cgroup_is_dead(pos))) {
3055 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3056 if (&next->sibling != &pos->parent->children)
3057 return next;
3058 return NULL;
3059 }
3060
3061 /*
3062 * Can't dereference the next pointer. Each cgroup is given a
3063 * monotonically increasing unique serial number and always
3064 * appended to the sibling list, so the next one can be found by
3065 * walking the parent's children until we see a cgroup with higher
3066 * serial number than @pos's.
3067 *
3068 * While this path can be slow, it's taken only when either the
3069 * current cgroup is removed or iteration and removal race.
3070 */
3071 list_for_each_entry_rcu(next, &pos->parent->children, sibling)
3072 if (next->serial_nr > pos->serial_nr)
3073 return next;
3074 return NULL;
3075}
3076EXPORT_SYMBOL_GPL(cgroup_next_sibling);
3077
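The slow path of cgroup_next_sibling() leans on the two invariants spelled out in the comment: serial numbers only ever grow and new children are appended, so a parent's child list stays sorted by serial number. The user-space sketch below shows how the remembered serial number of a removed position is enough to resume iteration (plain arrays instead of RCU-protected lists; illustrative only):

#include <stdio.h>

/* Find the first sibling whose serial number is larger than the one we held. */
static int next_by_serial(const unsigned long long *serials, int n,
			  unsigned long long pos_serial)
{
	for (int i = 0; i < n; i++)
		if (serials[i] > pos_serial)
			return i;
	return -1;			/* no further sibling */
}

int main(void)
{
	/* Current children; the node with serial 11 was created and then removed. */
	unsigned long long children[] = { 10, 12, 15 };
	unsigned long long removed_pos = 11;

	int i = next_by_serial(children, 3, removed_pos);
	printf("resume at serial %llu\n", children[i]);	/* 12 */
	return 0;
}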
3078/**
2945 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 3079 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
2946 * @pos: the current position (%NULL to initiate traversal) 3080 * @pos: the current position (%NULL to initiate traversal)
2947 * @cgroup: cgroup whose descendants to walk 3081 * @cgroup: cgroup whose descendants to walk
2948 * 3082 *
2949 * To be used by cgroup_for_each_descendant_pre(). Find the next 3083 * To be used by cgroup_for_each_descendant_pre(). Find the next
2950 * descendant to visit for pre-order traversal of @cgroup's descendants. 3084 * descendant to visit for pre-order traversal of @cgroup's descendants.
3085 *
3086 * While this function requires RCU read locking, it doesn't require the
3087 * whole traversal to be contained in a single RCU critical section. This
3088 * function will return the correct next descendant as long as both @pos
3089 * and @cgroup are accessible and @pos is a descendant of @cgroup.
2951 */ 3090 */
2952struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 3091struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2953 struct cgroup *cgroup) 3092 struct cgroup *cgroup)
@@ -2967,11 +3106,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2967 3106
2968 /* no child, visit my or the closest ancestor's next sibling */ 3107 /* no child, visit my or the closest ancestor's next sibling */
2969 while (pos != cgroup) { 3108 while (pos != cgroup) {
2970 next = list_entry_rcu(pos->sibling.next, struct cgroup, 3109 next = cgroup_next_sibling(pos);
2971 sibling); 3110 if (next)
2972 if (&next->sibling != &pos->parent->children)
2973 return next; 3111 return next;
2974
2975 pos = pos->parent; 3112 pos = pos->parent;
2976 } 3113 }
2977 3114
@@ -2986,6 +3123,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
2986 * Return the rightmost descendant of @pos. If there's no descendant, 3123 * Return the rightmost descendant of @pos. If there's no descendant,
2987 * @pos is returned. This can be used during pre-order traversal to skip 3124 * @pos is returned. This can be used during pre-order traversal to skip
2988 * subtree of @pos. 3125 * subtree of @pos.
3126 *
3127 * While this function requires RCU read locking, it doesn't require the
3128 * whole traversal to be contained in a single RCU critical section. This
3129 * function will return the correct rightmost descendant as long as @pos is
3130 * accessible.
2989 */ 3131 */
2990struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3132struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
2991{ 3133{
@@ -3025,6 +3167,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3025 * 3167 *
3026 * To be used by cgroup_for_each_descendant_post(). Find the next 3168 * To be used by cgroup_for_each_descendant_post(). Find the next
3027 * descendant to visit for post-order traversal of @cgroup's descendants. 3169 * descendant to visit for post-order traversal of @cgroup's descendants.
3170 *
3171 * While this function requires RCU read locking, it doesn't require the
3172 * whole traversal to be contained in a single RCU critical section. This
3173 * function will return the correct next descendant as long as both @pos
3174 * and @cgroup are accessible and @pos is a descendant of @cgroup.
3028 */ 3175 */
3029struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3176struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3030 struct cgroup *cgroup) 3177 struct cgroup *cgroup)
@@ -3040,8 +3187,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3040 } 3187 }
3041 3188
3042 /* if there's an unvisited sibling, visit its leftmost descendant */ 3189 /* if there's an unvisited sibling, visit its leftmost descendant */
3043 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3190 next = cgroup_next_sibling(pos);
3044 if (&next->sibling != &pos->parent->children) 3191 if (next)
3045 return cgroup_leftmost_descendant(next); 3192 return cgroup_leftmost_descendant(next);
3046 3193
3047 /* no sibling left, visit parent */ 3194 /* no sibling left, visit parent */
@@ -3062,7 +3209,7 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
3062 cgroup_enable_task_cg_lists(); 3209 cgroup_enable_task_cg_lists();
3063 3210
3064 read_lock(&css_set_lock); 3211 read_lock(&css_set_lock);
3065 it->cg_link = &cgrp->css_sets; 3212 it->cset_link = &cgrp->cset_links;
3066 cgroup_advance_iter(cgrp, it); 3213 cgroup_advance_iter(cgrp, it);
3067} 3214}
3068 3215
@@ -3071,16 +3218,16 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3071{ 3218{
3072 struct task_struct *res; 3219 struct task_struct *res;
3073 struct list_head *l = it->task; 3220 struct list_head *l = it->task;
3074 struct cg_cgroup_link *link; 3221 struct cgrp_cset_link *link;
3075 3222
3076 /* If the iterator cg is NULL, we have no tasks */ 3223 /* If the iterator cg is NULL, we have no tasks */
3077 if (!it->cg_link) 3224 if (!it->cset_link)
3078 return NULL; 3225 return NULL;
3079 res = list_entry(l, struct task_struct, cg_list); 3226 res = list_entry(l, struct task_struct, cg_list);
3080 /* Advance iterator to find next entry */ 3227 /* Advance iterator to find next entry */
3081 l = l->next; 3228 l = l->next;
3082 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); 3229 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3083 if (l == &link->cg->tasks) { 3230 if (l == &link->cset->tasks) {
3084 /* We reached the end of this task list - move on to 3231 /* We reached the end of this task list - move on to
3085 * the next cg_cgroup_link */ 3232 * the next cg_cgroup_link */
3086 cgroup_advance_iter(cgrp, it); 3233 cgroup_advance_iter(cgrp, it);
@@ -3411,7 +3558,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3411 } 3558 }
3412 } 3559 }
3413 /* entry not found; create a new one */ 3560 /* entry not found; create a new one */
3414 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3561 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3415 if (!l) { 3562 if (!l) {
3416 mutex_unlock(&cgrp->pidlist_mutex); 3563 mutex_unlock(&cgrp->pidlist_mutex);
3417 return l; 3564 return l;
@@ -3420,8 +3567,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3420 down_write(&l->mutex); 3567 down_write(&l->mutex);
3421 l->key.type = type; 3568 l->key.type = type;
3422 l->key.ns = get_pid_ns(ns); 3569 l->key.ns = get_pid_ns(ns);
3423 l->use_count = 0; /* don't increment here */
3424 l->list = NULL;
3425 l->owner = cgrp; 3570 l->owner = cgrp;
3426 list_add(&l->links, &cgrp->pidlists); 3571 list_add(&l->links, &cgrp->pidlists);
3427 mutex_unlock(&cgrp->pidlist_mutex); 3572 mutex_unlock(&cgrp->pidlist_mutex);
@@ -3727,6 +3872,23 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
3727} 3872}
3728 3873
3729/* 3874/*
3875 * When dput() is called asynchronously, if umount has been done and
3876 * then deactivate_super() in cgroup_free_fn() kills the superblock,
3877 * there's a small window that vfs will see the root dentry with non-zero
3878 * refcnt and trigger BUG().
3879 *
3880 * That's why we hold a reference before dput() and drop it right after.
3881 */
3882static void cgroup_dput(struct cgroup *cgrp)
3883{
3884 struct super_block *sb = cgrp->root->sb;
3885
3886 atomic_inc(&sb->s_active);
3887 dput(cgrp->dentry);
3888 deactivate_super(sb);
3889}
3890
3891/*
3730 * Unregister event and free resources. 3892 * Unregister event and free resources.
3731 * 3893 *
3732 * Gets called from workqueue. 3894 * Gets called from workqueue.
@@ -3746,7 +3908,7 @@ static void cgroup_event_remove(struct work_struct *work)
3746 3908
3747 eventfd_ctx_put(event->eventfd); 3909 eventfd_ctx_put(event->eventfd);
3748 kfree(event); 3910 kfree(event);
3749 dput(cgrp->dentry); 3911 cgroup_dput(cgrp);
3750} 3912}
3751 3913
3752/* 3914/*
@@ -3933,33 +4095,16 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
3933 return 0; 4095 return 0;
3934} 4096}
3935 4097
3936/* 4098static struct cftype cgroup_base_files[] = {
3937 * for the common functions, 'private' gives the type of file
3938 */
3939/* for hysterical raisins, we can't put this on the older files */
3940#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
3941static struct cftype files[] = {
3942 {
3943 .name = "tasks",
3944 .open = cgroup_tasks_open,
3945 .write_u64 = cgroup_tasks_write,
3946 .release = cgroup_pidlist_release,
3947 .mode = S_IRUGO | S_IWUSR,
3948 },
3949 { 4099 {
3950 .name = CGROUP_FILE_GENERIC_PREFIX "procs", 4100 .name = "cgroup.procs",
3951 .open = cgroup_procs_open, 4101 .open = cgroup_procs_open,
3952 .write_u64 = cgroup_procs_write, 4102 .write_u64 = cgroup_procs_write,
3953 .release = cgroup_pidlist_release, 4103 .release = cgroup_pidlist_release,
3954 .mode = S_IRUGO | S_IWUSR, 4104 .mode = S_IRUGO | S_IWUSR,
3955 }, 4105 },
3956 { 4106 {
3957 .name = "notify_on_release", 4107 .name = "cgroup.event_control",
3958 .read_u64 = cgroup_read_notify_on_release,
3959 .write_u64 = cgroup_write_notify_on_release,
3960 },
3961 {
3962 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3963 .write_string = cgroup_write_event_control, 4108 .write_string = cgroup_write_event_control,
3964 .mode = S_IWUGO, 4109 .mode = S_IWUGO,
3965 }, 4110 },
@@ -3974,9 +4119,29 @@ static struct cftype files[] = {
3974 .flags = CFTYPE_ONLY_ON_ROOT, 4119 .flags = CFTYPE_ONLY_ON_ROOT,
3975 .read_seq_string = cgroup_sane_behavior_show, 4120 .read_seq_string = cgroup_sane_behavior_show,
3976 }, 4121 },
4122
4123 /*
4124 * Historical crazy stuff. These don't have "cgroup." prefix and
4125 * don't exist if sane_behavior. If you're depending on these, be
4126 * prepared to be burned.
4127 */
4128 {
4129 .name = "tasks",
4130 .flags = CFTYPE_INSANE, /* use "procs" instead */
4131 .open = cgroup_tasks_open,
4132 .write_u64 = cgroup_tasks_write,
4133 .release = cgroup_pidlist_release,
4134 .mode = S_IRUGO | S_IWUSR,
4135 },
4136 {
4137 .name = "notify_on_release",
4138 .flags = CFTYPE_INSANE,
4139 .read_u64 = cgroup_read_notify_on_release,
4140 .write_u64 = cgroup_write_notify_on_release,
4141 },
3977 { 4142 {
3978 .name = "release_agent", 4143 .name = "release_agent",
3979 .flags = CFTYPE_ONLY_ON_ROOT, 4144 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3980 .read_seq_string = cgroup_release_agent_show, 4145 .read_seq_string = cgroup_release_agent_show,
3981 .write_string = cgroup_release_agent_write, 4146 .write_string = cgroup_release_agent_write,
3982 .max_write_len = PATH_MAX, 4147 .max_write_len = PATH_MAX,
@@ -3997,13 +4162,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
3997 struct cgroup_subsys *ss; 4162 struct cgroup_subsys *ss;
3998 4163
3999 if (base_files) { 4164 if (base_files) {
4000 err = cgroup_addrm_files(cgrp, NULL, files, true); 4165 err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
4001 if (err < 0) 4166 if (err < 0)
4002 return err; 4167 return err;
4003 } 4168 }
4004 4169
4005 /* process cftsets of each subsystem */ 4170 /* process cftsets of each subsystem */
4006 for_each_subsys(cgrp->root, ss) { 4171 for_each_root_subsys(cgrp->root, ss) {
4007 struct cftype_set *set; 4172 struct cftype_set *set;
4008 if (!test_bit(ss->subsys_id, &subsys_mask)) 4173 if (!test_bit(ss->subsys_id, &subsys_mask))
4009 continue; 4174 continue;
@@ -4013,15 +4178,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
4013 } 4178 }
4014 4179
4015 /* This cgroup is ready now */ 4180 /* This cgroup is ready now */
4016 for_each_subsys(cgrp->root, ss) { 4181 for_each_root_subsys(cgrp->root, ss) {
4017 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4182 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4183 struct css_id *id = rcu_dereference_protected(css->id, true);
4184
4018 /* 4185 /*
4019 * Update id->css pointer and make this css visible from 4186 * Update id->css pointer and make this css visible from
4020 * CSS ID functions. This pointer will be dereferened 4187 * CSS ID functions. This pointer will be dereferened
4021 * from RCU-read-side without locks. 4188 * from RCU-read-side without locks.
4022 */ 4189 */
4023 if (css->id) 4190 if (id)
4024 rcu_assign_pointer(css->id->css, css); 4191 rcu_assign_pointer(id->css, css);
4025 } 4192 }
4026 4193
4027 return 0; 4194 return 0;
@@ -4031,12 +4198,16 @@ static void css_dput_fn(struct work_struct *work)
4031{ 4198{
4032 struct cgroup_subsys_state *css = 4199 struct cgroup_subsys_state *css =
4033 container_of(work, struct cgroup_subsys_state, dput_work); 4200 container_of(work, struct cgroup_subsys_state, dput_work);
4034 struct dentry *dentry = css->cgroup->dentry;
4035 struct super_block *sb = dentry->d_sb;
4036 4201
4037 atomic_inc(&sb->s_active); 4202 cgroup_dput(css->cgroup);
4038 dput(dentry); 4203}
4039 deactivate_super(sb); 4204
4205static void css_release(struct percpu_ref *ref)
4206{
4207 struct cgroup_subsys_state *css =
4208 container_of(ref, struct cgroup_subsys_state, refcnt);
4209
4210 schedule_work(&css->dput_work);
4040} 4211}
4041 4212
4042static void init_cgroup_css(struct cgroup_subsys_state *css, 4213static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -4044,10 +4215,9 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
4044 struct cgroup *cgrp) 4215 struct cgroup *cgrp)
4045{ 4216{
4046 css->cgroup = cgrp; 4217 css->cgroup = cgrp;
4047 atomic_set(&css->refcnt, 1);
4048 css->flags = 0; 4218 css->flags = 0;
4049 css->id = NULL; 4219 css->id = NULL;
4050 if (cgrp == dummytop) 4220 if (cgrp == cgroup_dummy_top)
4051 css->flags |= CSS_ROOT; 4221 css->flags |= CSS_ROOT;
4052 BUG_ON(cgrp->subsys[ss->subsys_id]); 4222 BUG_ON(cgrp->subsys[ss->subsys_id]);
4053 cgrp->subsys[ss->subsys_id] = css; 4223 cgrp->subsys[ss->subsys_id] = css;
@@ -4157,7 +4327,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4157 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 4327 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4158 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4328 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4159 4329
4160 for_each_subsys(root, ss) { 4330 for_each_root_subsys(root, ss) {
4161 struct cgroup_subsys_state *css; 4331 struct cgroup_subsys_state *css;
4162 4332
4163 css = ss->css_alloc(cgrp); 4333 css = ss->css_alloc(cgrp);
@@ -4165,7 +4335,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4165 err = PTR_ERR(css); 4335 err = PTR_ERR(css);
4166 goto err_free_all; 4336 goto err_free_all;
4167 } 4337 }
4338
4339 err = percpu_ref_init(&css->refcnt, css_release);
4340 if (err)
4341 goto err_free_all;
4342
4168 init_cgroup_css(css, ss, cgrp); 4343 init_cgroup_css(css, ss, cgrp);
4344
4169 if (ss->use_id) { 4345 if (ss->use_id) {
4170 err = alloc_css_id(ss, parent, cgrp); 4346 err = alloc_css_id(ss, parent, cgrp);
4171 if (err) 4347 if (err)
@@ -4183,20 +4359,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4183 goto err_free_all; 4359 goto err_free_all;
4184 lockdep_assert_held(&dentry->d_inode->i_mutex); 4360 lockdep_assert_held(&dentry->d_inode->i_mutex);
4185 4361
4362 cgrp->serial_nr = cgroup_serial_nr_next++;
4363
4186 /* allocation complete, commit to creation */ 4364 /* allocation complete, commit to creation */
4187 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4188 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4365 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4189 root->number_of_cgroups++; 4366 root->number_of_cgroups++;
4190 4367
4191 /* each css holds a ref to the cgroup's dentry */ 4368 /* each css holds a ref to the cgroup's dentry */
4192 for_each_subsys(root, ss) 4369 for_each_root_subsys(root, ss)
4193 dget(dentry); 4370 dget(dentry);
4194 4371
4195 /* hold a ref to the parent's dentry */ 4372 /* hold a ref to the parent's dentry */
4196 dget(parent->dentry); 4373 dget(parent->dentry);
4197 4374
4198 /* creation succeeded, notify subsystems */ 4375 /* creation succeeded, notify subsystems */
4199 for_each_subsys(root, ss) { 4376 for_each_root_subsys(root, ss) {
4200 err = online_css(ss, cgrp); 4377 err = online_css(ss, cgrp);
4201 if (err) 4378 if (err)
4202 goto err_destroy; 4379 goto err_destroy;
@@ -4221,9 +4398,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4221 return 0; 4398 return 0;
4222 4399
4223err_free_all: 4400err_free_all:
4224 for_each_subsys(root, ss) { 4401 for_each_root_subsys(root, ss) {
4225 if (cgrp->subsys[ss->subsys_id]) 4402 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4403
4404 if (css) {
4405 percpu_ref_cancel_init(&css->refcnt);
4226 ss->css_free(cgrp); 4406 ss->css_free(cgrp);
4407 }
4227 } 4408 }
4228 mutex_unlock(&cgroup_mutex); 4409 mutex_unlock(&cgroup_mutex);
4229 /* Release the reference count that we took on the superblock */ 4410 /* Release the reference count that we took on the superblock */
@@ -4251,63 +4432,120 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4251 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4432 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4252} 4433}
4253 4434
4435static void cgroup_css_killed(struct cgroup *cgrp)
4436{
4437 if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
4438 return;
4439
4440 /* percpu ref's of all css's are killed, kick off the next step */
4441 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
4442 schedule_work(&cgrp->destroy_work);
4443}
4444
4445static void css_ref_killed_fn(struct percpu_ref *ref)
4446{
4447 struct cgroup_subsys_state *css =
4448 container_of(ref, struct cgroup_subsys_state, refcnt);
4449
4450 cgroup_css_killed(css->cgroup);
4451}
4452
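cgroup_css_killed() above is a biased completion count: css_kill_cnt starts at 1 so the initiator's own reference keeps it from reaching zero while kill-and-confirm requests are still being queued, and whichever decrement finally hits zero schedules the second destruction stage exactly once. A user-space sketch of that bias-and-decrement idiom, with threads standing in for the percpu-ref confirmation callbacks (names invented; build with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int kill_cnt = 1;		/* bias held by the initiator */

static void next_stage(void)
{
	/* The kernel version would schedule_work(&cgrp->destroy_work) here. */
	puts("all confirmations in: run stage two");
}

static void confirm_killed(void)
{
	if (atomic_fetch_sub(&kill_cnt, 1) == 1)
		next_stage();
}

static void *confirmation_cb(void *arg)
{
	(void)arg;
	confirm_killed();		/* models css_ref_killed_fn() */
	return NULL;
}

int main(void)
{
	pthread_t th[3];

	/* Initiator: take one count per pending confirmation... */
	for (int i = 0; i < 3; i++) {
		atomic_fetch_add(&kill_cnt, 1);
		pthread_create(&th[i], NULL, confirmation_cb, NULL);
	}
	/* ...then drop the bias; only now can the count reach zero. */
	confirm_killed();

	for (int i = 0; i < 3; i++)
		pthread_join(th[i], NULL);
	return 0;
}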
4453/**
4454 * cgroup_destroy_locked - the first stage of cgroup destruction
4455 * @cgrp: cgroup to be destroyed
4456 *
4457 * css's make use of percpu refcnts whose killing latency shouldn't be
4458 * exposed to userland and are RCU protected. Also, cgroup core needs to
4459 * guarantee that css_tryget() won't succeed by the time ->css_offline() is
4460 * invoked. To satisfy all the requirements, destruction is implemented in
4461 * the following two steps.
4462 *
4463 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
4464 * userland visible parts and start killing the percpu refcnts of
4465 * css's. Set up so that the next stage will be kicked off once all
4466 * the percpu refcnts are confirmed to be killed.
4467 *
4468 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4469 * rest of destruction. Once all cgroup references are gone, the
4470 * cgroup is RCU-freed.
4471 *
4472 * This function implements s1. After this step, @cgrp is gone as far as
4473 * the userland is concerned and a new cgroup with the same name may be
4474 * created. As cgroup doesn't care about the names internally, this
4475 * doesn't cause any problem.
4476 */
4254static int cgroup_destroy_locked(struct cgroup *cgrp) 4477static int cgroup_destroy_locked(struct cgroup *cgrp)
4255 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4478 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4256{ 4479{
4257 struct dentry *d = cgrp->dentry; 4480 struct dentry *d = cgrp->dentry;
4258 struct cgroup *parent = cgrp->parent;
4259 struct cgroup_event *event, *tmp; 4481 struct cgroup_event *event, *tmp;
4260 struct cgroup_subsys *ss; 4482 struct cgroup_subsys *ss;
4483 bool empty;
4261 4484
4262 lockdep_assert_held(&d->d_inode->i_mutex); 4485 lockdep_assert_held(&d->d_inode->i_mutex);
4263 lockdep_assert_held(&cgroup_mutex); 4486 lockdep_assert_held(&cgroup_mutex);
4264 4487
4265 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) 4488 /*
4489 * css_set_lock synchronizes access to ->cset_links and prevents
4490 * @cgrp from being removed while __put_css_set() is in progress.
4491 */
4492 read_lock(&css_set_lock);
4493 empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children);
4494 read_unlock(&css_set_lock);
4495 if (!empty)
4266 return -EBUSY; 4496 return -EBUSY;
4267 4497
4268 /* 4498 /*
4269 * Block new css_tryget() by deactivating refcnt and mark @cgrp 4499 * Block new css_tryget() by killing css refcnts. cgroup core
4270 * removed. This makes future css_tryget() and child creation 4500 * guarantees that, by the time ->css_offline() is invoked, no new
4271 * attempts fail thus maintaining the removal conditions verified 4501 * css reference will be given out via css_tryget(). We can't
4272 * above. 4502 * simply call percpu_ref_kill() and proceed to offlining css's
4503 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4504 * as killed on all CPUs on return.
4505 *
4506 * Use percpu_ref_kill_and_confirm() to get notifications as each
4507 * css is confirmed to be seen as killed on all CPUs. The
4508 * notification callback keeps track of the number of css's to be
4509 * killed and schedules cgroup_offline_fn() to perform the rest of
4510 * destruction once the percpu refs of all css's are confirmed to
4511 * be killed.
4273 */ 4512 */
4274 for_each_subsys(cgrp->root, ss) { 4513 atomic_set(&cgrp->css_kill_cnt, 1);
4514 for_each_root_subsys(cgrp->root, ss) {
4275 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4515 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4276 4516
4277 WARN_ON(atomic_read(&css->refcnt) < 0); 4517 /*
4278 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4518 * Killing would put the base ref, but we need to keep it
4279 } 4519 * alive until after ->css_offline.
4280 set_bit(CGRP_REMOVED, &cgrp->flags); 4520 */
4521 percpu_ref_get(&css->refcnt);
4281 4522
4282 /* tell subsystems to initate destruction */ 4523 atomic_inc(&cgrp->css_kill_cnt);
4283 for_each_subsys(cgrp->root, ss) 4524 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4284 offline_css(ss, cgrp); 4525 }
4526 cgroup_css_killed(cgrp);
4285 4527
4286 /* 4528 /*
4287 * Put all the base refs. Each css holds an extra reference to the 4529 * Mark @cgrp dead. This prevents further task migration and child
4288 * cgroup's dentry and cgroup removal proceeds regardless of css 4530 * creation by disabling cgroup_lock_live_group(). Note that
4289 * refs. On the last put of each css, whenever that may be, the 4531 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to
4290 * extra dentry ref is put so that dentry destruction happens only 4532 * resume iteration after dropping RCU read lock. See
4291 * after all css's are released. 4533 * cgroup_next_sibling() for details.
4292 */ 4534 */
4293 for_each_subsys(cgrp->root, ss) 4535 set_bit(CGRP_DEAD, &cgrp->flags);
4294 css_put(cgrp->subsys[ss->subsys_id]);
4295 4536
4537 /* CGRP_DEAD is set, remove from ->release_list for the last time */
4296 raw_spin_lock(&release_list_lock); 4538 raw_spin_lock(&release_list_lock);
4297 if (!list_empty(&cgrp->release_list)) 4539 if (!list_empty(&cgrp->release_list))
4298 list_del_init(&cgrp->release_list); 4540 list_del_init(&cgrp->release_list);
4299 raw_spin_unlock(&release_list_lock); 4541 raw_spin_unlock(&release_list_lock);
4300 4542
4301 /* delete this cgroup from parent->children */ 4543 /*
4302 list_del_rcu(&cgrp->sibling); 4544 * Remove @cgrp directory. The removal puts the base ref but we
4303 list_del_init(&cgrp->allcg_node); 4545 * aren't quite done with @cgrp yet, so hold onto it.
4304 4546 */
4305 dget(d); 4547 dget(d);
4306 cgroup_d_remove_dir(d); 4548 cgroup_d_remove_dir(d);
4307 dput(d);
4308
4309 set_bit(CGRP_RELEASABLE, &parent->flags);
4310 check_for_release(parent);
4311 4549
4312 /* 4550 /*
4313 * Unregister events and notify userspace. 4551 * Unregister events and notify userspace.
@@ -4322,6 +4560,53 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4322 spin_unlock(&cgrp->event_list_lock); 4560 spin_unlock(&cgrp->event_list_lock);
4323 4561
4324 return 0; 4562 return 0;
4563};
4564
4565/**
4566 * cgroup_offline_fn - the second step of cgroup destruction
4567 * @work: cgroup->destroy_free_work
4568 *
4569 * This function is invoked from a work item for a cgroup which is being
4570 * destroyed after the percpu refcnts of all css's are guaranteed to be
4571 * seen as killed on all CPUs, and performs the rest of destruction. This
4572 * is the second step of destruction described in the comment above
4573 * cgroup_destroy_locked().
4574 */
4575static void cgroup_offline_fn(struct work_struct *work)
4576{
4577 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
4578 struct cgroup *parent = cgrp->parent;
4579 struct dentry *d = cgrp->dentry;
4580 struct cgroup_subsys *ss;
4581
4582 mutex_lock(&cgroup_mutex);
4583
4584 /*
4585 * css_tryget() is guaranteed to fail now. Tell subsystems to
4586 * initiate destruction.
4587 */
4588 for_each_root_subsys(cgrp->root, ss)
4589 offline_css(ss, cgrp);
4590
4591 /*
4592 * Put the css refs from cgroup_destroy_locked(). Each css holds
4593 * an extra reference to the cgroup's dentry and cgroup removal
4594 * proceeds regardless of css refs. On the last put of each css,
4595 * whenever that may be, the extra dentry ref is put so that dentry
4596 * destruction happens only after all css's are released.
4597 */
4598 for_each_root_subsys(cgrp->root, ss)
4599 css_put(cgrp->subsys[ss->subsys_id]);
4600
4601 /* delete this cgroup from parent->children */
4602 list_del_rcu(&cgrp->sibling);
4603
4604 dput(d);
4605
4606 set_bit(CGRP_RELEASABLE, &parent->flags);
4607 check_for_release(parent);
4608
4609 mutex_unlock(&cgroup_mutex);
4325} 4610}
4326 4611
4327static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4612static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4361,12 +4646,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4361 cgroup_init_cftsets(ss); 4646 cgroup_init_cftsets(ss);
4362 4647
4363 /* Create the top cgroup state for this subsystem */ 4648 /* Create the top cgroup state for this subsystem */
4364 list_add(&ss->sibling, &rootnode.subsys_list); 4649 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4365 ss->root = &rootnode; 4650 ss->root = &cgroup_dummy_root;
4366 css = ss->css_alloc(dummytop); 4651 css = ss->css_alloc(cgroup_dummy_top);
4367 /* We don't handle early failures gracefully */ 4652 /* We don't handle early failures gracefully */
4368 BUG_ON(IS_ERR(css)); 4653 BUG_ON(IS_ERR(css));
4369 init_cgroup_css(css, ss, dummytop); 4654 init_cgroup_css(css, ss, cgroup_dummy_top);
4370 4655
4371 /* Update the init_css_set to contain a subsys 4656 /* Update the init_css_set to contain a subsys
4372 * pointer to this state - since the subsystem is 4657 * pointer to this state - since the subsystem is
@@ -4381,7 +4666,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4381 * need to invoke fork callbacks here. */ 4666 * need to invoke fork callbacks here. */
4382 BUG_ON(!list_empty(&init_task.tasks)); 4667 BUG_ON(!list_empty(&init_task.tasks));
4383 4668
4384 BUG_ON(online_css(ss, dummytop)); 4669 BUG_ON(online_css(ss, cgroup_dummy_top));
4385 4670
4386 mutex_unlock(&cgroup_mutex); 4671 mutex_unlock(&cgroup_mutex);
4387 4672
@@ -4404,7 +4689,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4404 struct cgroup_subsys_state *css; 4689 struct cgroup_subsys_state *css;
4405 int i, ret; 4690 int i, ret;
4406 struct hlist_node *tmp; 4691 struct hlist_node *tmp;
4407 struct css_set *cg; 4692 struct css_set *cset;
4408 unsigned long key; 4693 unsigned long key;
4409 4694
4410 /* check name and function validity */ 4695 /* check name and function validity */
@@ -4427,7 +4712,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4427 */ 4712 */
4428 if (ss->module == NULL) { 4713 if (ss->module == NULL) {
4429 /* a sanity check */ 4714 /* a sanity check */
4430 BUG_ON(subsys[ss->subsys_id] != ss); 4715 BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
4431 return 0; 4716 return 0;
4432 } 4717 }
4433 4718
@@ -4435,26 +4720,26 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4435 cgroup_init_cftsets(ss); 4720 cgroup_init_cftsets(ss);
4436 4721
4437 mutex_lock(&cgroup_mutex); 4722 mutex_lock(&cgroup_mutex);
4438 subsys[ss->subsys_id] = ss; 4723 cgroup_subsys[ss->subsys_id] = ss;
4439 4724
4440 /* 4725 /*
4441 * no ss->css_alloc seems to need anything important in the ss 4726 * no ss->css_alloc seems to need anything important in the ss
4442 * struct, so this can happen first (i.e. before the rootnode 4727 * struct, so this can happen first (i.e. before the dummy root
4443 * attachment). 4728 * attachment).
4444 */ 4729 */
4445 css = ss->css_alloc(dummytop); 4730 css = ss->css_alloc(cgroup_dummy_top);
4446 if (IS_ERR(css)) { 4731 if (IS_ERR(css)) {
4447 /* failure case - need to deassign the subsys[] slot. */ 4732 /* failure case - need to deassign the cgroup_subsys[] slot. */
4448 subsys[ss->subsys_id] = NULL; 4733 cgroup_subsys[ss->subsys_id] = NULL;
4449 mutex_unlock(&cgroup_mutex); 4734 mutex_unlock(&cgroup_mutex);
4450 return PTR_ERR(css); 4735 return PTR_ERR(css);
4451 } 4736 }
4452 4737
4453 list_add(&ss->sibling, &rootnode.subsys_list); 4738 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4454 ss->root = &rootnode; 4739 ss->root = &cgroup_dummy_root;
4455 4740
4456 /* our new subsystem will be attached to the dummy hierarchy. */ 4741 /* our new subsystem will be attached to the dummy hierarchy. */
4457 init_cgroup_css(css, ss, dummytop); 4742 init_cgroup_css(css, ss, cgroup_dummy_top);
4458 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4743 /* init_idr must be after init_cgroup_css because it sets css->id. */
4459 if (ss->use_id) { 4744 if (ss->use_id) {
4460 ret = cgroup_init_idr(ss, css); 4745 ret = cgroup_init_idr(ss, css);
@@ -4471,21 +4756,21 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4471 * this is all done under the css_set_lock. 4756 * this is all done under the css_set_lock.
4472 */ 4757 */
4473 write_lock(&css_set_lock); 4758 write_lock(&css_set_lock);
4474 hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { 4759 hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
4475 /* skip entries that we already rehashed */ 4760 /* skip entries that we already rehashed */
4476 if (cg->subsys[ss->subsys_id]) 4761 if (cset->subsys[ss->subsys_id])
4477 continue; 4762 continue;
4478 /* remove existing entry */ 4763 /* remove existing entry */
4479 hash_del(&cg->hlist); 4764 hash_del(&cset->hlist);
4480 /* set new value */ 4765 /* set new value */
4481 cg->subsys[ss->subsys_id] = css; 4766 cset->subsys[ss->subsys_id] = css;
4482 /* recompute hash and restore entry */ 4767 /* recompute hash and restore entry */
4483 key = css_set_hash(cg->subsys); 4768 key = css_set_hash(cset->subsys);
4484 hash_add(css_set_table, &cg->hlist, key); 4769 hash_add(css_set_table, &cset->hlist, key);
4485 } 4770 }
4486 write_unlock(&css_set_lock); 4771 write_unlock(&css_set_lock);
4487 4772
4488 ret = online_css(ss, dummytop); 4773 ret = online_css(ss, cgroup_dummy_top);
4489 if (ret) 4774 if (ret)
4490 goto err_unload; 4775 goto err_unload;
4491 4776
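
The rehash loop just above is the standard pattern for mutating a field that feeds a hashtable key: unhash the entry, change the field, recompute the key, re-add, all under the lock that guards the table (css_set_lock here). A minimal sketch of that pattern using the same <linux/hashtable.h> helpers; my_table, my_set and my_key are invented names, not cgroup code:

	#include <linux/hashtable.h>
	#include <linux/jhash.h>
	#include <linux/types.h>

	#define MY_TABLE_BITS	7
	static DEFINE_HASHTABLE(my_table, MY_TABLE_BITS);

	struct my_set {
		void *slots[4];			/* the hash key is derived from these */
		struct hlist_node hlist;
	};

	static u32 my_key(const struct my_set *set)
	{
		return jhash(set->slots, sizeof(set->slots), 0);
	}

	/* caller holds whatever lock protects my_table */
	static void my_set_update_slot(struct my_set *set, int idx, void *val)
	{
		hash_del(&set->hlist);		/* remove under the stale key */
		set->slots[idx] = val;		/* mutate the key material */
		hash_add(my_table, &set->hlist, my_key(set));	/* re-add under the new key */
	}
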
@@ -4511,7 +4796,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4511 */ 4796 */
4512void cgroup_unload_subsys(struct cgroup_subsys *ss) 4797void cgroup_unload_subsys(struct cgroup_subsys *ss)
4513{ 4798{
4514 struct cg_cgroup_link *link; 4799 struct cgrp_cset_link *link;
4515 4800
4516 BUG_ON(ss->module == NULL); 4801 BUG_ON(ss->module == NULL);
4517 4802
@@ -4520,45 +4805,46 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4520 * try_module_get in parse_cgroupfs_options should ensure that it 4805 * try_module_get in parse_cgroupfs_options should ensure that it
4521 * doesn't start being used while we're killing it off. 4806 * doesn't start being used while we're killing it off.
4522 */ 4807 */
4523 BUG_ON(ss->root != &rootnode); 4808 BUG_ON(ss->root != &cgroup_dummy_root);
4524 4809
4525 mutex_lock(&cgroup_mutex); 4810 mutex_lock(&cgroup_mutex);
4526 4811
4527 offline_css(ss, dummytop); 4812 offline_css(ss, cgroup_dummy_top);
4528 4813
4529 if (ss->use_id) 4814 if (ss->use_id)
4530 idr_destroy(&ss->idr); 4815 idr_destroy(&ss->idr);
4531 4816
4532 /* deassign the subsys_id */ 4817 /* deassign the subsys_id */
4533 subsys[ss->subsys_id] = NULL; 4818 cgroup_subsys[ss->subsys_id] = NULL;
4534 4819
4535 /* remove subsystem from rootnode's list of subsystems */ 4820 /* remove subsystem from the dummy root's list of subsystems */
4536 list_del_init(&ss->sibling); 4821 list_del_init(&ss->sibling);
4537 4822
4538 /* 4823 /*
4539 * disentangle the css from all css_sets attached to the dummytop. as 4824 * disentangle the css from all css_sets attached to the dummy
4540 * in loading, we need to pay our respects to the hashtable gods. 4825 * top. as in loading, we need to pay our respects to the hashtable
4826 * gods.
4541 */ 4827 */
4542 write_lock(&css_set_lock); 4828 write_lock(&css_set_lock);
4543 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { 4829 list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
4544 struct css_set *cg = link->cg; 4830 struct css_set *cset = link->cset;
4545 unsigned long key; 4831 unsigned long key;
4546 4832
4547 hash_del(&cg->hlist); 4833 hash_del(&cset->hlist);
4548 cg->subsys[ss->subsys_id] = NULL; 4834 cset->subsys[ss->subsys_id] = NULL;
4549 key = css_set_hash(cg->subsys); 4835 key = css_set_hash(cset->subsys);
4550 hash_add(css_set_table, &cg->hlist, key); 4836 hash_add(css_set_table, &cset->hlist, key);
4551 } 4837 }
4552 write_unlock(&css_set_lock); 4838 write_unlock(&css_set_lock);
4553 4839
4554 /* 4840 /*
4555 * remove subsystem's css from the dummytop and free it - need to 4841 * remove subsystem's css from the cgroup_dummy_top and free it -
4556 * free before marking as null because ss->css_free needs the 4842 * need to free before marking as null because ss->css_free needs
4557 * cgrp->subsys pointer to find their state. note that this also 4843 * the cgrp->subsys pointer to find their state. note that this
4558 * takes care of freeing the css_id. 4844 * also takes care of freeing the css_id.
4559 */ 4845 */
4560 ss->css_free(dummytop); 4846 ss->css_free(cgroup_dummy_top);
4561 dummytop->subsys[ss->subsys_id] = NULL; 4847 cgroup_dummy_top->subsys[ss->subsys_id] = NULL;
4562 4848
4563 mutex_unlock(&cgroup_mutex); 4849 mutex_unlock(&cgroup_mutex);
4564} 4850}
@@ -4572,30 +4858,25 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4572 */ 4858 */
4573int __init cgroup_init_early(void) 4859int __init cgroup_init_early(void)
4574{ 4860{
4861 struct cgroup_subsys *ss;
4575 int i; 4862 int i;
4863
4576 atomic_set(&init_css_set.refcount, 1); 4864 atomic_set(&init_css_set.refcount, 1);
4577 INIT_LIST_HEAD(&init_css_set.cg_links); 4865 INIT_LIST_HEAD(&init_css_set.cgrp_links);
4578 INIT_LIST_HEAD(&init_css_set.tasks); 4866 INIT_LIST_HEAD(&init_css_set.tasks);
4579 INIT_HLIST_NODE(&init_css_set.hlist); 4867 INIT_HLIST_NODE(&init_css_set.hlist);
4580 css_set_count = 1; 4868 css_set_count = 1;
4581 init_cgroup_root(&rootnode); 4869 init_cgroup_root(&cgroup_dummy_root);
4582 root_count = 1; 4870 cgroup_root_count = 1;
4583 init_task.cgroups = &init_css_set; 4871 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4584 4872
4585 init_css_set_link.cg = &init_css_set; 4873 init_cgrp_cset_link.cset = &init_css_set;
4586 init_css_set_link.cgrp = dummytop; 4874 init_cgrp_cset_link.cgrp = cgroup_dummy_top;
4587 list_add(&init_css_set_link.cgrp_link_list, 4875 list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
4588 &rootnode.top_cgroup.css_sets); 4876 list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);
4589 list_add(&init_css_set_link.cg_link_list,
4590 &init_css_set.cg_links);
4591
4592 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4593 struct cgroup_subsys *ss = subsys[i];
4594
4595 /* at bootup time, we don't worry about modular subsystems */
4596 if (!ss || ss->module)
4597 continue;
4598 4877
4878 /* at bootup time, we don't worry about modular subsystems */
4879 for_each_builtin_subsys(ss, i) {
4599 BUG_ON(!ss->name); 4880 BUG_ON(!ss->name);
4600 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4881 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4601 BUG_ON(!ss->css_alloc); 4882 BUG_ON(!ss->css_alloc);
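
The open-coded NULL and ss->module checks disappear because the iteration itself is pushed into a helper. One plausible shape for such an iterator, written here as a sketch rather than the kernel's exact definition, simply walks the first CGROUP_BUILTIN_SUBSYS_COUNT slots of the renamed cgroup_subsys[] array:

	/* illustrative stand-in for for_each_builtin_subsys() */
	#define for_each_builtin_subsys_sketch(ss, i)				\
		for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT &&		\
		     (((ss) = cgroup_subsys[i]) || true); (i)++)

	static void sanity_check_builtin_subsys_sketch(void)
	{
		struct cgroup_subsys *ss;
		int i;

		for_each_builtin_subsys_sketch(ss, i)
			BUG_ON(!ss->name || !ss->css_alloc);
	}
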
@@ -4620,30 +4901,33 @@ int __init cgroup_init_early(void)
4620 */ 4901 */
4621int __init cgroup_init(void) 4902int __init cgroup_init(void)
4622{ 4903{
4623 int err; 4904 struct cgroup_subsys *ss;
4624 int i;
4625 unsigned long key; 4905 unsigned long key;
4906 int i, err;
4626 4907
4627 err = bdi_init(&cgroup_backing_dev_info); 4908 err = bdi_init(&cgroup_backing_dev_info);
4628 if (err) 4909 if (err)
4629 return err; 4910 return err;
4630 4911
4631 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4912 for_each_builtin_subsys(ss, i) {
4632 struct cgroup_subsys *ss = subsys[i];
4633
4634 /* at bootup time, we don't worry about modular subsystems */
4635 if (!ss || ss->module)
4636 continue;
4637 if (!ss->early_init) 4913 if (!ss->early_init)
4638 cgroup_init_subsys(ss); 4914 cgroup_init_subsys(ss);
4639 if (ss->use_id) 4915 if (ss->use_id)
4640 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); 4916 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
4641 } 4917 }
4642 4918
4919 /* allocate id for the dummy hierarchy */
4920 mutex_lock(&cgroup_mutex);
4921 mutex_lock(&cgroup_root_mutex);
4922
4643 /* Add init_css_set to the hash table */ 4923 /* Add init_css_set to the hash table */
4644 key = css_set_hash(init_css_set.subsys); 4924 key = css_set_hash(init_css_set.subsys);
4645 hash_add(css_set_table, &init_css_set.hlist, key); 4925 hash_add(css_set_table, &init_css_set.hlist, key);
4646 BUG_ON(!init_root_id(&rootnode)); 4926
4927 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
4928
4929 mutex_unlock(&cgroup_root_mutex);
4930 mutex_unlock(&cgroup_mutex);
4647 4931
4648 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4932 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4649 if (!cgroup_kobj) { 4933 if (!cgroup_kobj) {
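
The id allocation for the dummy hierarchy is done with both cgroup_mutex and cgroup_root_mutex held; the nesting order (cgroup_mutex outer, cgroup_root_mutex inner) has to match every other site that takes the pair. A trivial sketch of that rule, with a placeholder body:

	static void cgroup_both_locks_sketch(void)
	{
		mutex_lock(&cgroup_mutex);		/* outer lock, always taken first */
		mutex_lock(&cgroup_root_mutex);		/* inner lock nests inside it */

		/* ... touch hierarchy-id state guarded by both locks ... */

		mutex_unlock(&cgroup_root_mutex);	/* release in reverse order */
		mutex_unlock(&cgroup_mutex);
	}
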
@@ -4708,7 +4992,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4708 int count = 0; 4992 int count = 0;
4709 4993
4710 seq_printf(m, "%d:", root->hierarchy_id); 4994 seq_printf(m, "%d:", root->hierarchy_id);
4711 for_each_subsys(root, ss) 4995 for_each_root_subsys(root, ss)
4712 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4996 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4713 if (strlen(root->name)) 4997 if (strlen(root->name))
4714 seq_printf(m, "%sname=%s", count ? "," : "", 4998 seq_printf(m, "%sname=%s", count ? "," : "",
@@ -4734,6 +5018,7 @@ out:
4734/* Display information about each subsystem and each hierarchy */ 5018/* Display information about each subsystem and each hierarchy */
4735static int proc_cgroupstats_show(struct seq_file *m, void *v) 5019static int proc_cgroupstats_show(struct seq_file *m, void *v)
4736{ 5020{
5021 struct cgroup_subsys *ss;
4737 int i; 5022 int i;
4738 5023
4739 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 5024 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
@@ -4743,14 +5028,12 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
4743 * subsys/hierarchy state. 5028 * subsys/hierarchy state.
4744 */ 5029 */
4745 mutex_lock(&cgroup_mutex); 5030 mutex_lock(&cgroup_mutex);
4746 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 5031
4747 struct cgroup_subsys *ss = subsys[i]; 5032 for_each_subsys(ss, i)
4748 if (ss == NULL)
4749 continue;
4750 seq_printf(m, "%s\t%d\t%d\t%d\n", 5033 seq_printf(m, "%s\t%d\t%d\t%d\n",
4751 ss->name, ss->root->hierarchy_id, 5034 ss->name, ss->root->hierarchy_id,
4752 ss->root->number_of_cgroups, !ss->disabled); 5035 ss->root->number_of_cgroups, !ss->disabled);
4753 } 5036
4754 mutex_unlock(&cgroup_mutex); 5037 mutex_unlock(&cgroup_mutex);
4755 return 0; 5038 return 0;
4756} 5039}
@@ -4786,8 +5069,8 @@ static const struct file_operations proc_cgroupstats_operations = {
4786void cgroup_fork(struct task_struct *child) 5069void cgroup_fork(struct task_struct *child)
4787{ 5070{
4788 task_lock(current); 5071 task_lock(current);
5072 get_css_set(task_css_set(current));
4789 child->cgroups = current->cgroups; 5073 child->cgroups = current->cgroups;
4790 get_css_set(child->cgroups);
4791 task_unlock(current); 5074 task_unlock(current);
4792 INIT_LIST_HEAD(&child->cg_list); 5075 INIT_LIST_HEAD(&child->cg_list);
4793} 5076}
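
cgroup_fork() now takes the css_set reference before the child copies the pointer, all under task_lock(), so the copy never escapes unpinned. Stripped of the cgroup specifics, the pattern looks roughly like the sketch below; struct pinned_obj, owner_lock and the use of kref are illustrative assumptions, not cgroup code:

	#include <linux/kref.h>
	#include <linux/spinlock.h>

	struct pinned_obj {
		struct kref ref;
	};

	static DEFINE_SPINLOCK(owner_lock);
	static struct pinned_obj *owner_ptr;	/* assumed non-NULL; stable only while owner_lock is held */

	static struct pinned_obj *inherit_pinned_obj(void)
	{
		struct pinned_obj *p;

		spin_lock(&owner_lock);
		p = owner_ptr;
		kref_get(&p->ref);		/* pin before the copy escapes the lock */
		spin_unlock(&owner_lock);

		return p;			/* caller now owns one reference */
	}
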
@@ -4804,6 +5087,7 @@ void cgroup_fork(struct task_struct *child)
4804 */ 5087 */
4805void cgroup_post_fork(struct task_struct *child) 5088void cgroup_post_fork(struct task_struct *child)
4806{ 5089{
5090 struct cgroup_subsys *ss;
4807 int i; 5091 int i;
4808 5092
4809 /* 5093 /*
@@ -4821,7 +5105,7 @@ void cgroup_post_fork(struct task_struct *child)
4821 write_lock(&css_set_lock); 5105 write_lock(&css_set_lock);
4822 task_lock(child); 5106 task_lock(child);
4823 if (list_empty(&child->cg_list)) 5107 if (list_empty(&child->cg_list))
4824 list_add(&child->cg_list, &child->cgroups->tasks); 5108 list_add(&child->cg_list, &task_css_set(child)->tasks);
4825 task_unlock(child); 5109 task_unlock(child);
4826 write_unlock(&css_set_lock); 5110 write_unlock(&css_set_lock);
4827 } 5111 }
@@ -4840,12 +5124,9 @@ void cgroup_post_fork(struct task_struct *child)
4840 * of the array can be freed at module unload, so we 5124 * of the array can be freed at module unload, so we
4841 * can't touch that. 5125 * can't touch that.
4842 */ 5126 */
4843 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 5127 for_each_builtin_subsys(ss, i)
4844 struct cgroup_subsys *ss = subsys[i];
4845
4846 if (ss->fork) 5128 if (ss->fork)
4847 ss->fork(child); 5129 ss->fork(child);
4848 }
4849 } 5130 }
4850} 5131}
4851 5132
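
The post-fork path links the child onto its css_set's task list only if it is not already on one, using list_empty() on the node itself as the "am I linked?" test; that only works because the node was INIT_LIST_HEAD()ed at creation. A generic sketch of the idiom, with invented names and an rwlock standing in for css_set_lock:

	#include <linux/list.h>
	#include <linux/spinlock.h>

	struct tracked {
		struct list_head node;		/* INIT_LIST_HEAD() at creation time */
	};

	static LIST_HEAD(tracked_list);
	static DEFINE_RWLOCK(tracked_lock);

	static void track_once(struct tracked *t)
	{
		write_lock(&tracked_lock);
		if (list_empty(&t->node))	/* an unlinked node points at itself */
			list_add(&t->node, &tracked_list);
		write_unlock(&tracked_lock);
	}
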
@@ -4886,7 +5167,8 @@ void cgroup_post_fork(struct task_struct *child)
4886 */ 5167 */
4887void cgroup_exit(struct task_struct *tsk, int run_callbacks) 5168void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4888{ 5169{
4889 struct css_set *cg; 5170 struct cgroup_subsys *ss;
5171 struct css_set *cset;
4890 int i; 5172 int i;
4891 5173
4892 /* 5174 /*
@@ -4903,36 +5185,32 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4903 5185
4904 /* Reassign the task to the init_css_set. */ 5186 /* Reassign the task to the init_css_set. */
4905 task_lock(tsk); 5187 task_lock(tsk);
4906 cg = tsk->cgroups; 5188 cset = task_css_set(tsk);
4907 tsk->cgroups = &init_css_set; 5189 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
4908 5190
4909 if (run_callbacks && need_forkexit_callback) { 5191 if (run_callbacks && need_forkexit_callback) {
4910 /* 5192 /*
4911 * fork/exit callbacks are supported only for builtin 5193 * fork/exit callbacks are supported only for builtin
4912 * subsystems, see cgroup_post_fork() for details. 5194 * subsystems, see cgroup_post_fork() for details.
4913 */ 5195 */
4914 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 5196 for_each_builtin_subsys(ss, i) {
4915 struct cgroup_subsys *ss = subsys[i];
4916
4917 if (ss->exit) { 5197 if (ss->exit) {
4918 struct cgroup *old_cgrp = 5198 struct cgroup *old_cgrp = cset->subsys[i]->cgroup;
4919 rcu_dereference_raw(cg->subsys[i])->cgroup;
4920 struct cgroup *cgrp = task_cgroup(tsk, i); 5199 struct cgroup *cgrp = task_cgroup(tsk, i);
5200
4921 ss->exit(cgrp, old_cgrp, tsk); 5201 ss->exit(cgrp, old_cgrp, tsk);
4922 } 5202 }
4923 } 5203 }
4924 } 5204 }
4925 task_unlock(tsk); 5205 task_unlock(tsk);
4926 5206
4927 put_css_set_taskexit(cg); 5207 put_css_set_taskexit(cset);
4928} 5208}
4929 5209
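
Condensed to its core, the exit path above swaps the task over to the permanent init_css_set under task_lock() and drops the reference on the old css_set only after unlocking, since the put may do heavier teardown work. A sketch of just that core, reusing identifiers already visible in the hunk and omitting the exit callbacks:

	static void cgroup_exit_core_sketch(struct task_struct *tsk)
	{
		struct css_set *old_cset;

		task_lock(tsk);
		old_cset = task_css_set(tsk);
		RCU_INIT_POINTER(tsk->cgroups, &init_css_set);	/* the fallback is never freed */
		task_unlock(tsk);

		put_css_set_taskexit(old_cset);	/* old reference dropped outside the lock */
	}
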
4930static void check_for_release(struct cgroup *cgrp) 5210static void check_for_release(struct cgroup *cgrp)
4931{ 5211{
4932 /* All of these checks rely on RCU to keep the cgroup
4933 * structure alive */
4934 if (cgroup_is_releasable(cgrp) && 5212 if (cgroup_is_releasable(cgrp) &&
4935 !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { 5213 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
4936 /* 5214 /*
4937 * Control Group is currently removeable. If it's not 5215 * Control Group is currently removeable. If it's not
4938 * already queued for a userspace notification, queue 5216 * already queued for a userspace notification, queue
@@ -4941,7 +5219,7 @@ static void check_for_release(struct cgroup *cgrp)
4941 int need_schedule_work = 0; 5219 int need_schedule_work = 0;
4942 5220
4943 raw_spin_lock(&release_list_lock); 5221 raw_spin_lock(&release_list_lock);
4944 if (!cgroup_is_removed(cgrp) && 5222 if (!cgroup_is_dead(cgrp) &&
4945 list_empty(&cgrp->release_list)) { 5223 list_empty(&cgrp->release_list)) {
4946 list_add(&cgrp->release_list, &release_list); 5224 list_add(&cgrp->release_list, &release_list);
4947 need_schedule_work = 1; 5225 need_schedule_work = 1;
@@ -4952,34 +5230,6 @@ static void check_for_release(struct cgroup *cgrp)
4952 } 5230 }
4953} 5231}
4954 5232
4955/* Caller must verify that the css is not for root cgroup */
4956bool __css_tryget(struct cgroup_subsys_state *css)
4957{
4958 while (true) {
4959 int t, v;
4960
4961 v = css_refcnt(css);
4962 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
4963 if (likely(t == v))
4964 return true;
4965 else if (t < 0)
4966 return false;
4967 cpu_relax();
4968 }
4969}
4970EXPORT_SYMBOL_GPL(__css_tryget);
4971
4972/* Caller must verify that the css is not for root cgroup */
4973void __css_put(struct cgroup_subsys_state *css)
4974{
4975 int v;
4976
4977 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
4978 if (v == 0)
4979 schedule_work(&css->dput_work);
4980}
4981EXPORT_SYMBOL_GPL(__css_put);
4982
4983/* 5233/*
4984 * Notify userspace when a cgroup is released, by running the 5234 * Notify userspace when a cgroup is released, by running the
4985 * configured release agent with the name of the cgroup (path 5235 * configured release agent with the name of the cgroup (path
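
The deleted __css_tryget() is the classic compare-and-swap tryget: the increment succeeds only while the counter has not gone negative (dead), and the loop retries on contention. In isolation the pattern looks like the sketch below, written against a plain atomic_t with an invented name and without the refcount bias the cgroup code used:

	#include <linux/atomic.h>
	#include <linux/types.h>

	static bool refcnt_tryget(atomic_t *refcnt)
	{
		int old, seen;

		while (true) {
			old = atomic_read(refcnt);
			if (old < 0)
				return false;	/* object already marked dead */
			seen = atomic_cmpxchg(refcnt, old, old + 1);
			if (seen == old)
				return true;	/* we installed old + 1 */
			cpu_relax();		/* lost a race; reread and retry */
		}
	}
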
@@ -5054,23 +5304,19 @@ static void cgroup_release_agent(struct work_struct *work)
5054 5304
5055static int __init cgroup_disable(char *str) 5305static int __init cgroup_disable(char *str)
5056{ 5306{
5057 int i; 5307 struct cgroup_subsys *ss;
5058 char *token; 5308 char *token;
5309 int i;
5059 5310
5060 while ((token = strsep(&str, ",")) != NULL) { 5311 while ((token = strsep(&str, ",")) != NULL) {
5061 if (!*token) 5312 if (!*token)
5062 continue; 5313 continue;
5063 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
5064 struct cgroup_subsys *ss = subsys[i];
5065
5066 /*
5067 * cgroup_disable, being at boot time, can't
5068 * know about module subsystems, so we don't
5069 * worry about them.
5070 */
5071 if (!ss || ss->module)
5072 continue;
5073 5314
5315 /*
5316 * cgroup_disable, being at boot time, can't know about
5317 * module subsystems, so we don't worry about them.
5318 */
5319 for_each_builtin_subsys(ss, i) {
5074 if (!strcmp(token, ss->name)) { 5320 if (!strcmp(token, ss->name)) {
5075 ss->disabled = 1; 5321 ss->disabled = 1;
5076 printk(KERN_INFO "Disabling %s control group" 5322 printk(KERN_INFO "Disabling %s control group"
@@ -5087,9 +5333,7 @@ __setup("cgroup_disable=", cgroup_disable);
5087  * Functions for CSS ID. 5333
5088 */ 5334 */
5089 5335
5090/* 5336/* to get ID other than 0, this should be called when !cgroup_is_dead() */
5091 *To get ID other than 0, this should be called when !cgroup_is_removed().
5092 */
5093unsigned short css_id(struct cgroup_subsys_state *css) 5337unsigned short css_id(struct cgroup_subsys_state *css)
5094{ 5338{
5095 struct css_id *cssid; 5339 struct css_id *cssid;
@@ -5099,7 +5343,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
5099 * on this or this is under rcu_read_lock(). Once css->id is allocated, 5343 * on this or this is under rcu_read_lock(). Once css->id is allocated,
5100 * it's unchanged until freed. 5344 * it's unchanged until freed.
5101 */ 5345 */
5102 cssid = rcu_dereference_check(css->id, css_refcnt(css)); 5346 cssid = rcu_dereference_raw(css->id);
5103 5347
5104 if (cssid) 5348 if (cssid)
5105 return cssid->id; 5349 return cssid->id;
@@ -5107,18 +5351,6 @@ unsigned short css_id(struct cgroup_subsys_state *css)
5107} 5351}
5108EXPORT_SYMBOL_GPL(css_id); 5352EXPORT_SYMBOL_GPL(css_id);
5109 5353
5110unsigned short css_depth(struct cgroup_subsys_state *css)
5111{
5112 struct css_id *cssid;
5113
5114 cssid = rcu_dereference_check(css->id, css_refcnt(css));
5115
5116 if (cssid)
5117 return cssid->depth;
5118 return 0;
5119}
5120EXPORT_SYMBOL_GPL(css_depth);
5121
5122/** 5354/**
5123 * css_is_ancestor - test "root" css is an ancestor of "child" 5355 * css_is_ancestor - test "root" css is an ancestor of "child"
5124 * @child: the css to be tested. 5356 * @child: the css to be tested.
@@ -5153,7 +5385,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
5153 5385
5154void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 5386void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
5155{ 5387{
5156 struct css_id *id = css->id; 5388 struct css_id *id = rcu_dereference_protected(css->id, true);
5389
5157 /* When this is called before css_id initialization, id can be NULL */ 5390 /* When this is called before css_id initialization, id can be NULL */
5158 if (!id) 5391 if (!id)
5159 return; 5392 return;
@@ -5219,8 +5452,8 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5219 return PTR_ERR(newid); 5452 return PTR_ERR(newid);
5220 5453
5221 newid->stack[0] = newid->id; 5454 newid->stack[0] = newid->id;
5222 newid->css = rootcss; 5455 RCU_INIT_POINTER(newid->css, rootcss);
5223 rootcss->id = newid; 5456 RCU_INIT_POINTER(rootcss->id, newid);
5224 return 0; 5457 return 0;
5225} 5458}
5226 5459
@@ -5234,7 +5467,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
5234 subsys_id = ss->subsys_id; 5467 subsys_id = ss->subsys_id;
5235 parent_css = parent->subsys[subsys_id]; 5468 parent_css = parent->subsys[subsys_id];
5236 child_css = child->subsys[subsys_id]; 5469 child_css = child->subsys[subsys_id];
5237 parent_id = parent_css->id; 5470 parent_id = rcu_dereference_protected(parent_css->id, true);
5238 depth = parent_id->depth + 1; 5471 depth = parent_id->depth + 1;
5239 5472
5240 child_id = get_new_cssid(ss, depth); 5473 child_id = get_new_cssid(ss, depth);
@@ -5299,7 +5532,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5299} 5532}
5300 5533
5301#ifdef CONFIG_CGROUP_DEBUG 5534#ifdef CONFIG_CGROUP_DEBUG
5302static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) 5535static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
5303{ 5536{
5304 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5537 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5305 5538
@@ -5309,48 +5542,43 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5309 return css; 5542 return css;
5310} 5543}
5311 5544
5312static void debug_css_free(struct cgroup *cont) 5545static void debug_css_free(struct cgroup *cgrp)
5313{
5314 kfree(cont->subsys[debug_subsys_id]);
5315}
5316
5317static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
5318{ 5546{
5319 return atomic_read(&cont->count); 5547 kfree(cgrp->subsys[debug_subsys_id]);
5320} 5548}
5321 5549
5322static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) 5550static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft)
5323{ 5551{
5324 return cgroup_task_count(cont); 5552 return cgroup_task_count(cgrp);
5325} 5553}
5326 5554
5327static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) 5555static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft)
5328{ 5556{
5329 return (u64)(unsigned long)current->cgroups; 5557 return (u64)(unsigned long)current->cgroups;
5330} 5558}
5331 5559
5332static u64 current_css_set_refcount_read(struct cgroup *cont, 5560static u64 current_css_set_refcount_read(struct cgroup *cgrp,
5333 struct cftype *cft) 5561 struct cftype *cft)
5334{ 5562{
5335 u64 count; 5563 u64 count;
5336 5564
5337 rcu_read_lock(); 5565 rcu_read_lock();
5338 count = atomic_read(&current->cgroups->refcount); 5566 count = atomic_read(&task_css_set(current)->refcount);
5339 rcu_read_unlock(); 5567 rcu_read_unlock();
5340 return count; 5568 return count;
5341} 5569}
5342 5570
5343static int current_css_set_cg_links_read(struct cgroup *cont, 5571static int current_css_set_cg_links_read(struct cgroup *cgrp,
5344 struct cftype *cft, 5572 struct cftype *cft,
5345 struct seq_file *seq) 5573 struct seq_file *seq)
5346{ 5574{
5347 struct cg_cgroup_link *link; 5575 struct cgrp_cset_link *link;
5348 struct css_set *cg; 5576 struct css_set *cset;
5349 5577
5350 read_lock(&css_set_lock); 5578 read_lock(&css_set_lock);
5351 rcu_read_lock(); 5579 rcu_read_lock();
5352 cg = rcu_dereference(current->cgroups); 5580 cset = rcu_dereference(current->cgroups);
5353 list_for_each_entry(link, &cg->cg_links, cg_link_list) { 5581 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5354 struct cgroup *c = link->cgrp; 5582 struct cgroup *c = link->cgrp;
5355 const char *name; 5583 const char *name;
5356 5584
@@ -5367,19 +5595,19 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
5367} 5595}
5368 5596
5369#define MAX_TASKS_SHOWN_PER_CSS 25 5597#define MAX_TASKS_SHOWN_PER_CSS 25
5370static int cgroup_css_links_read(struct cgroup *cont, 5598static int cgroup_css_links_read(struct cgroup *cgrp,
5371 struct cftype *cft, 5599 struct cftype *cft,
5372 struct seq_file *seq) 5600 struct seq_file *seq)
5373{ 5601{
5374 struct cg_cgroup_link *link; 5602 struct cgrp_cset_link *link;
5375 5603
5376 read_lock(&css_set_lock); 5604 read_lock(&css_set_lock);
5377 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { 5605 list_for_each_entry(link, &cgrp->cset_links, cset_link) {
5378 struct css_set *cg = link->cg; 5606 struct css_set *cset = link->cset;
5379 struct task_struct *task; 5607 struct task_struct *task;
5380 int count = 0; 5608 int count = 0;
5381 seq_printf(seq, "css_set %p\n", cg); 5609 seq_printf(seq, "css_set %p\n", cset);
5382 list_for_each_entry(task, &cg->tasks, cg_list) { 5610 list_for_each_entry(task, &cset->tasks, cg_list) {
5383 if (count++ > MAX_TASKS_SHOWN_PER_CSS) { 5611 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
5384 seq_puts(seq, " ...\n"); 5612 seq_puts(seq, " ...\n");
5385 break; 5613 break;
@@ -5400,10 +5628,6 @@ static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
5400 5628
5401static struct cftype debug_files[] = { 5629static struct cftype debug_files[] = {
5402 { 5630 {
5403 .name = "cgroup_refcount",
5404 .read_u64 = cgroup_refcount_read,
5405 },
5406 {
5407 .name = "taskcount", 5631 .name = "taskcount",
5408 .read_u64 = debug_taskcount_read, 5632 .read_u64 = debug_taskcount_read,
5409 }, 5633 },
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 902d13fc2b13..e5657788fedd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -59,6 +59,7 @@
59#include <linux/mutex.h> 59#include <linux/mutex.h>
60#include <linux/workqueue.h> 60#include <linux/workqueue.h>
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62#include <linux/wait.h>
62 63
63/* 64/*
64 * Tracks how many cpusets are currently defined in system. 65 * Tracks how many cpusets are currently defined in system.
@@ -87,6 +88,18 @@ struct cpuset {
87 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 88 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 89 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
89 90
91 /*
 92      * These are the old Memory Nodes that tasks took on.
93 *
94 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
95 * - A new cpuset's old_mems_allowed is initialized when some
96 * task is moved into it.
97 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
98 * cpuset.mems_allowed and have tasks' nodemask updated, and
99 * then old_mems_allowed is updated to mems_allowed.
100 */
101 nodemask_t old_mems_allowed;
102
90 struct fmeter fmeter; /* memory_pressure filter */ 103 struct fmeter fmeter; /* memory_pressure filter */
91 104
92 /* 105 /*
@@ -100,14 +113,12 @@ struct cpuset {
100 113
101 /* for custom sched domain */ 114 /* for custom sched domain */
102 int relax_domain_level; 115 int relax_domain_level;
103
104 struct work_struct hotplug_work;
105}; 116};
106 117
107/* Retrieve the cpuset for a cgroup */ 118/* Retrieve the cpuset for a cgroup */
108static inline struct cpuset *cgroup_cs(struct cgroup *cont) 119static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
109{ 120{
110 return container_of(cgroup_subsys_state(cont, cpuset_subsys_id), 121 return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id),
111 struct cpuset, css); 122 struct cpuset, css);
112} 123}
113 124
@@ -267,14 +278,11 @@ static DEFINE_MUTEX(callback_mutex);
267/* 278/*
268 * CPU / memory hotplug is handled asynchronously. 279 * CPU / memory hotplug is handled asynchronously.
269 */ 280 */
270static struct workqueue_struct *cpuset_propagate_hotplug_wq;
271
272static void cpuset_hotplug_workfn(struct work_struct *work); 281static void cpuset_hotplug_workfn(struct work_struct *work);
273static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
274static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
275
276static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); 282static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
277 283
284static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
285
278/* 286/*
279 * This is ugly, but preserves the userspace API for existing cpuset 287 * This is ugly, but preserves the userspace API for existing cpuset
280 * users. If someone tries to mount the "cpuset" filesystem, we 288 * users. If someone tries to mount the "cpuset" filesystem, we
@@ -304,53 +312,38 @@ static struct file_system_type cpuset_fs_type = {
304/* 312/*
305 * Return in pmask the portion of a cpusets's cpus_allowed that 313 * Return in pmask the portion of a cpusets's cpus_allowed that
306 * are online. If none are online, walk up the cpuset hierarchy 314 * are online. If none are online, walk up the cpuset hierarchy
307 * until we find one that does have some online cpus. If we get 315 * until we find one that does have some online cpus. The top
308 * all the way to the top and still haven't found any online cpus, 316 * cpuset always has some cpus online.
309 * return cpu_online_mask. Or if passed a NULL cs from an exit'ing
310 * task, return cpu_online_mask.
311 * 317 *
312 * One way or another, we guarantee to return some non-empty subset 318 * One way or another, we guarantee to return some non-empty subset
313 * of cpu_online_mask. 319 * of cpu_online_mask.
314 * 320 *
315 * Call with callback_mutex held. 321 * Call with callback_mutex held.
316 */ 322 */
317
318static void guarantee_online_cpus(const struct cpuset *cs, 323static void guarantee_online_cpus(const struct cpuset *cs,
319 struct cpumask *pmask) 324 struct cpumask *pmask)
320{ 325{
321 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 326 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
322 cs = parent_cs(cs); 327 cs = parent_cs(cs);
323 if (cs) 328 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
324 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
325 else
326 cpumask_copy(pmask, cpu_online_mask);
327 BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
328} 329}
329 330
330/* 331/*
331 * Return in *pmask the portion of a cpusets's mems_allowed that 332 * Return in *pmask the portion of a cpusets's mems_allowed that
332 * are online, with memory. If none are online with memory, walk 333 * are online, with memory. If none are online with memory, walk
333 * up the cpuset hierarchy until we find one that does have some 334 * up the cpuset hierarchy until we find one that does have some
334 * online mems. If we get all the way to the top and still haven't 335 * online mems. The top cpuset always has some mems online.
335 * found any online mems, return node_states[N_MEMORY].
336 * 336 *
337 * One way or another, we guarantee to return some non-empty subset 337 * One way or another, we guarantee to return some non-empty subset
338 * of node_states[N_MEMORY]. 338 * of node_states[N_MEMORY].
339 * 339 *
340 * Call with callback_mutex held. 340 * Call with callback_mutex held.
341 */ 341 */
342
343static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 342static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
344{ 343{
345 while (cs && !nodes_intersects(cs->mems_allowed, 344 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
346 node_states[N_MEMORY]))
347 cs = parent_cs(cs); 345 cs = parent_cs(cs);
348 if (cs) 346 nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
349 nodes_and(*pmask, cs->mems_allowed,
350 node_states[N_MEMORY]);
351 else
352 *pmask = node_states[N_MEMORY];
353 BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
354} 347}
355 348
356/* 349/*
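
Both helpers lose their NULL checks because the walk now leans on an invariant: the top cpuset always keeps some online CPUs and memory, so climbing with parent_cs() must hit a satisfying ancestor before running off the root. Reduced to its shape, the walk is just this sketch:

	static struct cpuset *nearest_with_online_cpus(struct cpuset *cs)
	{
		/* top_cpuset terminates the walk; no NULL check is needed */
		while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
			cs = parent_cs(cs);
		return cs;
	}
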
@@ -440,7 +433,7 @@ static void free_trial_cpuset(struct cpuset *trial)
440 433
441static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 434static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
442{ 435{
443 struct cgroup *cont; 436 struct cgroup *cgrp;
444 struct cpuset *c, *par; 437 struct cpuset *c, *par;
445 int ret; 438 int ret;
446 439
@@ -448,7 +441,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
448 441
449 /* Each of our child cpusets must be a subset of us */ 442 /* Each of our child cpusets must be a subset of us */
450 ret = -EBUSY; 443 ret = -EBUSY;
451 cpuset_for_each_child(c, cont, cur) 444 cpuset_for_each_child(c, cgrp, cur)
452 if (!is_cpuset_subset(c, trial)) 445 if (!is_cpuset_subset(c, trial))
453 goto out; 446 goto out;
454 447
@@ -469,7 +462,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
469 * overlap 462 * overlap
470 */ 463 */
471 ret = -EINVAL; 464 ret = -EINVAL;
472 cpuset_for_each_child(c, cont, par) { 465 cpuset_for_each_child(c, cgrp, par) {
473 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 466 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
474 c != cur && 467 c != cur &&
475 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 468 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -486,7 +479,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
486 */ 479 */
487 ret = -ENOSPC; 480 ret = -ENOSPC;
488 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && 481 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
489 (cpumask_empty(trial->cpus_allowed) || 482 (cpumask_empty(trial->cpus_allowed) &&
490 nodes_empty(trial->mems_allowed))) 483 nodes_empty(trial->mems_allowed)))
491 goto out; 484 goto out;
492 485
@@ -798,21 +791,43 @@ void rebuild_sched_domains(void)
798 mutex_unlock(&cpuset_mutex); 791 mutex_unlock(&cpuset_mutex);
799} 792}
800 793
801/** 794/*
802 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's 795 * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
803 * @tsk: task to test 796 * @cs: the cpuset in interest
804 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
805 * 797 *
806 * Call with cpuset_mutex held. May take callback_mutex during call. 798 * A cpuset's effective cpumask is the cpumask of the nearest ancestor
807 * Called for each task in a cgroup by cgroup_scan_tasks(). 799 * with non-empty cpus. We use effective cpumask whenever:
808 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other 800 * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
809 * words, if its mask is not equal to its cpuset's mask). 801 * if the cpuset they reside in has no cpus)
802 * - we want to retrieve task_cs(tsk)'s cpus_allowed.
803 *
804 * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
805 * exception. See comments there.
810 */ 806 */
811static int cpuset_test_cpumask(struct task_struct *tsk, 807static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
812 struct cgroup_scanner *scan)
813{ 808{
814 return !cpumask_equal(&tsk->cpus_allowed, 809 while (cpumask_empty(cs->cpus_allowed))
815 (cgroup_cs(scan->cg))->cpus_allowed); 810 cs = parent_cs(cs);
811 return cs;
812}
813
814/*
815 * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
816 * @cs: the cpuset in interest
817 *
818 * A cpuset's effective nodemask is the nodemask of the nearest ancestor
 819  * with non-empty mems. We use effective nodemask whenever:
820 * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
821 * if the cpuset they reside in has no mems)
822 * - we want to retrieve task_cs(tsk)'s mems_allowed.
823 *
824 * Called with cpuset_mutex held.
825 */
826static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
827{
828 while (nodes_empty(cs->mems_allowed))
829 cs = parent_cs(cs);
830 return cs;
816} 831}
817 832
818/** 833/**
@@ -829,7 +844,10 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
829static void cpuset_change_cpumask(struct task_struct *tsk, 844static void cpuset_change_cpumask(struct task_struct *tsk,
830 struct cgroup_scanner *scan) 845 struct cgroup_scanner *scan)
831{ 846{
832 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); 847 struct cpuset *cpus_cs;
848
849 cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
850 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
833} 851}
834 852
835/** 853/**
@@ -850,12 +868,51 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
850 struct cgroup_scanner scan; 868 struct cgroup_scanner scan;
851 869
852 scan.cg = cs->css.cgroup; 870 scan.cg = cs->css.cgroup;
853 scan.test_task = cpuset_test_cpumask; 871 scan.test_task = NULL;
854 scan.process_task = cpuset_change_cpumask; 872 scan.process_task = cpuset_change_cpumask;
855 scan.heap = heap; 873 scan.heap = heap;
856 cgroup_scan_tasks(&scan); 874 cgroup_scan_tasks(&scan);
857} 875}
858 876
877/*
878 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
879 * @root_cs: the root cpuset of the hierarchy
880 * @update_root: update root cpuset or not?
881 * @heap: the heap used by cgroup_scan_tasks()
882 *
883 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
884 * which take on cpumask of @root_cs.
885 *
886 * Called with cpuset_mutex held
887 */
888static void update_tasks_cpumask_hier(struct cpuset *root_cs,
889 bool update_root, struct ptr_heap *heap)
890{
891 struct cpuset *cp;
892 struct cgroup *pos_cgrp;
893
894 if (update_root)
895 update_tasks_cpumask(root_cs, heap);
896
897 rcu_read_lock();
898 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
 899             /* skip the whole subtree if @cp has some CPUs */
900 if (!cpumask_empty(cp->cpus_allowed)) {
901 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
902 continue;
903 }
904 if (!css_tryget(&cp->css))
905 continue;
906 rcu_read_unlock();
907
908 update_tasks_cpumask(cp, heap);
909
910 rcu_read_lock();
911 css_put(&cp->css);
912 }
913 rcu_read_unlock();
914}
915
859/** 916/**
860 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 917 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
861 * @cs: the cpuset to consider 918 * @cs: the cpuset to consider
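
update_tasks_cpumask_hier() introduces a walk pattern that recurs below for nodemasks: RCU protects the descendant iteration, but the per-cpuset update can sleep, so each visited cpuset is pinned with css_tryget(), the RCU read lock is dropped for the update, then re-taken before the reference is released. A sketch of that skeleton, with the skip-non-empty-subtree optimization left out for brevity:

	static void walk_and_update_sketch(struct cpuset *root_cs)
	{
		struct cpuset *cp;
		struct cgroup *pos_cgrp;

		rcu_read_lock();
		cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
			if (!css_tryget(&cp->css))	/* cpuset is going away, skip it */
				continue;
			rcu_read_unlock();		/* about to do sleeping work */

			update_tasks_cpumask(cp, NULL);	/* may block */

			rcu_read_lock();		/* back under RCU before iterating on */
			css_put(&cp->css);
		}
		rcu_read_unlock();
	}
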
@@ -888,14 +945,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
888 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) 945 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
889 return -EINVAL; 946 return -EINVAL;
890 } 947 }
891 retval = validate_change(cs, trialcs);
892 if (retval < 0)
893 return retval;
894 948
895 /* Nothing to do if the cpus didn't change */ 949 /* Nothing to do if the cpus didn't change */
896 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) 950 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
897 return 0; 951 return 0;
898 952
953 retval = validate_change(cs, trialcs);
954 if (retval < 0)
955 return retval;
956
899 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); 957 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
900 if (retval) 958 if (retval)
901 return retval; 959 return retval;
@@ -906,11 +964,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
906 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 964 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
907 mutex_unlock(&callback_mutex); 965 mutex_unlock(&callback_mutex);
908 966
909 /* 967 update_tasks_cpumask_hier(cs, true, &heap);
910 * Scan tasks in the cpuset, and update the cpumasks of any
911 * that need an update.
912 */
913 update_tasks_cpumask(cs, &heap);
914 968
915 heap_free(&heap); 969 heap_free(&heap);
916 970
@@ -943,12 +997,14 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
943 const nodemask_t *to) 997 const nodemask_t *to)
944{ 998{
945 struct task_struct *tsk = current; 999 struct task_struct *tsk = current;
1000 struct cpuset *mems_cs;
946 1001
947 tsk->mems_allowed = *to; 1002 tsk->mems_allowed = *to;
948 1003
949 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 1004 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
950 1005
951 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); 1006 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
1007 guarantee_online_mems(mems_cs, &tsk->mems_allowed);
952} 1008}
953 1009
954/* 1010/*
@@ -1007,16 +1063,12 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1007static void cpuset_change_nodemask(struct task_struct *p, 1063static void cpuset_change_nodemask(struct task_struct *p,
1008 struct cgroup_scanner *scan) 1064 struct cgroup_scanner *scan)
1009{ 1065{
1066 struct cpuset *cs = cgroup_cs(scan->cg);
1010 struct mm_struct *mm; 1067 struct mm_struct *mm;
1011 struct cpuset *cs;
1012 int migrate; 1068 int migrate;
1013 const nodemask_t *oldmem = scan->data; 1069 nodemask_t *newmems = scan->data;
1014 static nodemask_t newmems; /* protected by cpuset_mutex */
1015
1016 cs = cgroup_cs(scan->cg);
1017 guarantee_online_mems(cs, &newmems);
1018 1070
1019 cpuset_change_task_nodemask(p, &newmems); 1071 cpuset_change_task_nodemask(p, newmems);
1020 1072
1021 mm = get_task_mm(p); 1073 mm = get_task_mm(p);
1022 if (!mm) 1074 if (!mm)
@@ -1026,7 +1078,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1026 1078
1027 mpol_rebind_mm(mm, &cs->mems_allowed); 1079 mpol_rebind_mm(mm, &cs->mems_allowed);
1028 if (migrate) 1080 if (migrate)
1029 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); 1081 cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems);
1030 mmput(mm); 1082 mmput(mm);
1031} 1083}
1032 1084
@@ -1035,25 +1087,27 @@ static void *cpuset_being_rebound;
1035/** 1087/**
1036 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1088 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1037 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1089 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1038 * @oldmem: old mems_allowed of cpuset cs
1039 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1090 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1040 * 1091 *
1041 * Called with cpuset_mutex held 1092 * Called with cpuset_mutex held
1042 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1093 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1043 * if @heap != NULL. 1094 * if @heap != NULL.
1044 */ 1095 */
1045static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, 1096static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1046 struct ptr_heap *heap)
1047{ 1097{
1098 static nodemask_t newmems; /* protected by cpuset_mutex */
1048 struct cgroup_scanner scan; 1099 struct cgroup_scanner scan;
1100 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1049 1101
1050 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1102 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1051 1103
1104 guarantee_online_mems(mems_cs, &newmems);
1105
1052 scan.cg = cs->css.cgroup; 1106 scan.cg = cs->css.cgroup;
1053 scan.test_task = NULL; 1107 scan.test_task = NULL;
1054 scan.process_task = cpuset_change_nodemask; 1108 scan.process_task = cpuset_change_nodemask;
1055 scan.heap = heap; 1109 scan.heap = heap;
1056 scan.data = (nodemask_t *)oldmem; 1110 scan.data = &newmems;
1057 1111
1058 /* 1112 /*
1059 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1113 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1067,11 +1121,56 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1067 */ 1121 */
1068 cgroup_scan_tasks(&scan); 1122 cgroup_scan_tasks(&scan);
1069 1123
1124 /*
1125 * All the tasks' nodemasks have been updated, update
1126 * cs->old_mems_allowed.
1127 */
1128 cs->old_mems_allowed = newmems;
1129
1070 /* We're done rebinding vmas to this cpuset's new mems_allowed. */ 1130 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1071 cpuset_being_rebound = NULL; 1131 cpuset_being_rebound = NULL;
1072} 1132}
1073 1133
1074/* 1134/*
1135 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
 1136  * @root_cs: the root cpuset of the hierarchy
1137 * @update_root: update the root cpuset or not?
1138 * @heap: the heap used by cgroup_scan_tasks()
1139 *
1140 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1141 * which take on nodemask of @root_cs.
1142 *
1143 * Called with cpuset_mutex held
1144 */
1145static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1146 bool update_root, struct ptr_heap *heap)
1147{
1148 struct cpuset *cp;
1149 struct cgroup *pos_cgrp;
1150
1151 if (update_root)
1152 update_tasks_nodemask(root_cs, heap);
1153
1154 rcu_read_lock();
1155 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
 1156             /* skip the whole subtree if @cp has some memory */
1157 if (!nodes_empty(cp->mems_allowed)) {
1158 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
1159 continue;
1160 }
1161 if (!css_tryget(&cp->css))
1162 continue;
1163 rcu_read_unlock();
1164
1165 update_tasks_nodemask(cp, heap);
1166
1167 rcu_read_lock();
1168 css_put(&cp->css);
1169 }
1170 rcu_read_unlock();
1171}
1172
1173/*
1075 * Handle user request to change the 'mems' memory placement 1174 * Handle user request to change the 'mems' memory placement
1076 * of a cpuset. Needs to validate the request, update the 1175 * of a cpuset. Needs to validate the request, update the
1077 * cpusets mems_allowed, and for each task in the cpuset, 1176 * cpusets mems_allowed, and for each task in the cpuset,
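
update_tasks_nodemask() now computes the target nodemask itself into a function-local static, a recurring cpuset trick for keeping a potentially large nodemask_t off the stack; it is only safe because cpuset_mutex serializes every caller. The essence, with an invented helper name and the per-task scan elided:

	static void push_new_mems_sketch(struct cpuset *cs)
	{
		static nodemask_t scratch;	/* shared scratch, protected by cpuset_mutex */

		lockdep_assert_held(&cpuset_mutex);

		guarantee_online_mems(effective_nodemask_cpuset(cs), &scratch);

		/* ... hand &scratch to each task in @cs via cgroup_scan_tasks() ... */

		cs->old_mems_allowed = scratch;	/* remember what the tasks now see */
	}
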
@@ -1087,13 +1186,9 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1087static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 1186static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1088 const char *buf) 1187 const char *buf)
1089{ 1188{
1090 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1091 int retval; 1189 int retval;
1092 struct ptr_heap heap; 1190 struct ptr_heap heap;
1093 1191
1094 if (!oldmem)
1095 return -ENOMEM;
1096
1097 /* 1192 /*
1098 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 1193 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
1099 * it's read-only 1194 * it's read-only
@@ -1122,8 +1217,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1122 goto done; 1217 goto done;
1123 } 1218 }
1124 } 1219 }
1125 *oldmem = cs->mems_allowed; 1220
1126 if (nodes_equal(*oldmem, trialcs->mems_allowed)) { 1221 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1127 retval = 0; /* Too easy - nothing to do */ 1222 retval = 0; /* Too easy - nothing to do */
1128 goto done; 1223 goto done;
1129 } 1224 }
@@ -1139,11 +1234,10 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1139 cs->mems_allowed = trialcs->mems_allowed; 1234 cs->mems_allowed = trialcs->mems_allowed;
1140 mutex_unlock(&callback_mutex); 1235 mutex_unlock(&callback_mutex);
1141 1236
1142 update_tasks_nodemask(cs, oldmem, &heap); 1237 update_tasks_nodemask_hier(cs, true, &heap);
1143 1238
1144 heap_free(&heap); 1239 heap_free(&heap);
1145done: 1240done:
1146 NODEMASK_FREE(oldmem);
1147 return retval; 1241 return retval;
1148} 1242}
1149 1243
@@ -1372,8 +1466,13 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1372 1466
1373 mutex_lock(&cpuset_mutex); 1467 mutex_lock(&cpuset_mutex);
1374 1468
1469 /*
1470 * We allow to move tasks into an empty cpuset if sane_behavior
1471 * flag is set.
1472 */
1375 ret = -ENOSPC; 1473 ret = -ENOSPC;
1376 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1474 if (!cgroup_sane_behavior(cgrp) &&
1475 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1377 goto out_unlock; 1476 goto out_unlock;
1378 1477
1379 cgroup_taskset_for_each(task, cgrp, tset) { 1478 cgroup_taskset_for_each(task, cgrp, tset) {
@@ -1422,8 +1521,7 @@ static cpumask_var_t cpus_attach;
1422 1521
1423static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1522static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1424{ 1523{
1425 /* static bufs protected by cpuset_mutex */ 1524 /* static buf protected by cpuset_mutex */
1426 static nodemask_t cpuset_attach_nodemask_from;
1427 static nodemask_t cpuset_attach_nodemask_to; 1525 static nodemask_t cpuset_attach_nodemask_to;
1428 struct mm_struct *mm; 1526 struct mm_struct *mm;
1429 struct task_struct *task; 1527 struct task_struct *task;
@@ -1431,6 +1529,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1431 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); 1529 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
1432 struct cpuset *cs = cgroup_cs(cgrp); 1530 struct cpuset *cs = cgroup_cs(cgrp);
1433 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1531 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1532 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1533 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1434 1534
1435 mutex_lock(&cpuset_mutex); 1535 mutex_lock(&cpuset_mutex);
1436 1536
@@ -1438,9 +1538,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1438 if (cs == &top_cpuset) 1538 if (cs == &top_cpuset)
1439 cpumask_copy(cpus_attach, cpu_possible_mask); 1539 cpumask_copy(cpus_attach, cpu_possible_mask);
1440 else 1540 else
1441 guarantee_online_cpus(cs, cpus_attach); 1541 guarantee_online_cpus(cpus_cs, cpus_attach);
1442 1542
1443 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1543 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1444 1544
1445 cgroup_taskset_for_each(task, cgrp, tset) { 1545 cgroup_taskset_for_each(task, cgrp, tset) {
1446 /* 1546 /*
@@ -1457,26 +1557,32 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1457 * Change mm, possibly for multiple threads in a threadgroup. This is 1557 * Change mm, possibly for multiple threads in a threadgroup. This is
1458 * expensive and may sleep. 1558 * expensive and may sleep.
1459 */ 1559 */
1460 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1461 cpuset_attach_nodemask_to = cs->mems_allowed; 1560 cpuset_attach_nodemask_to = cs->mems_allowed;
1462 mm = get_task_mm(leader); 1561 mm = get_task_mm(leader);
1463 if (mm) { 1562 if (mm) {
1563 struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
1564
1464 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1565 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1465 if (is_memory_migrate(cs)) 1566
1466 cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, 1567 /*
 1568  * old_mems_allowed is the same as mems_allowed here, except
1569 * if this task is being moved automatically due to hotplug.
1570 * In that case @mems_allowed has been updated and is empty,
1571 * so @old_mems_allowed is the right nodesets that we migrate
1572 * mm from.
1573 */
1574 if (is_memory_migrate(cs)) {
1575 cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
1467 &cpuset_attach_nodemask_to); 1576 &cpuset_attach_nodemask_to);
1577 }
1468 mmput(mm); 1578 mmput(mm);
1469 } 1579 }
1470 1580
1471 cs->attach_in_progress--; 1581 cs->old_mems_allowed = cpuset_attach_nodemask_to;
1472 1582
1473 /* 1583 cs->attach_in_progress--;
1474 * We may have raced with CPU/memory hotunplug. Trigger hotplug 1584 if (!cs->attach_in_progress)
1475 * propagation if @cs doesn't have any CPU or memory. It will move 1585 wake_up(&cpuset_attach_wq);
1476 * the newly added tasks to the nearest parent which can execute.
1477 */
1478 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1479 schedule_cpuset_propagate_hotplug(cs);
1480 1586
1481 mutex_unlock(&cpuset_mutex); 1587 mutex_unlock(&cpuset_mutex);
1482} 1588}
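
The attach path now pairs with the hotplug path through a small counter-plus-waitqueue handshake: the last in-flight attach, still under cpuset_mutex, wakes anyone sleeping on cpuset_attach_wq. The wake side in isolation (illustrative helper name):

	static void attach_finished_sketch(struct cpuset *cs)
	{
		lockdep_assert_held(&cpuset_mutex);

		cs->attach_in_progress--;
		if (!cs->attach_in_progress)
			wake_up(&cpuset_attach_wq);	/* unblock the hotplug update path */
	}
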
@@ -1588,13 +1694,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1588 * resources, wait for the previously scheduled operations before 1694 * resources, wait for the previously scheduled operations before
1589 * proceeding, so that we don't end up keep removing tasks added 1695 * proceeding, so that we don't end up keep removing tasks added
1590 * after execution capability is restored. 1696 * after execution capability is restored.
1591 *
1592 * Flushing cpuset_hotplug_work is enough to synchronize against
1593 * hotplug hanlding; however, cpuset_attach() may schedule
1594 * propagation work directly. Flush the workqueue too.
1595 */ 1697 */
1596 flush_work(&cpuset_hotplug_work); 1698 flush_work(&cpuset_hotplug_work);
1597 flush_workqueue(cpuset_propagate_hotplug_wq);
1598 1699
1599 mutex_lock(&cpuset_mutex); 1700 mutex_lock(&cpuset_mutex);
1600 if (!is_cpuset_online(cs)) 1701 if (!is_cpuset_online(cs))
@@ -1658,13 +1759,13 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1658 return count; 1759 return count;
1659} 1760}
1660 1761
1661static ssize_t cpuset_common_file_read(struct cgroup *cont, 1762static ssize_t cpuset_common_file_read(struct cgroup *cgrp,
1662 struct cftype *cft, 1763 struct cftype *cft,
1663 struct file *file, 1764 struct file *file,
1664 char __user *buf, 1765 char __user *buf,
1665 size_t nbytes, loff_t *ppos) 1766 size_t nbytes, loff_t *ppos)
1666{ 1767{
1667 struct cpuset *cs = cgroup_cs(cont); 1768 struct cpuset *cs = cgroup_cs(cgrp);
1668 cpuset_filetype_t type = cft->private; 1769 cpuset_filetype_t type = cft->private;
1669 char *page; 1770 char *page;
1670 ssize_t retval = 0; 1771 ssize_t retval = 0;
@@ -1694,9 +1795,9 @@ out:
1694 return retval; 1795 return retval;
1695} 1796}
1696 1797
1697static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) 1798static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)
1698{ 1799{
1699 struct cpuset *cs = cgroup_cs(cont); 1800 struct cpuset *cs = cgroup_cs(cgrp);
1700 cpuset_filetype_t type = cft->private; 1801 cpuset_filetype_t type = cft->private;
1701 switch (type) { 1802 switch (type) {
1702 case FILE_CPU_EXCLUSIVE: 1803 case FILE_CPU_EXCLUSIVE:
@@ -1725,9 +1826,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1725 return 0; 1826 return 0;
1726} 1827}
1727 1828
1728static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) 1829static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft)
1729{ 1830{
1730 struct cpuset *cs = cgroup_cs(cont); 1831 struct cpuset *cs = cgroup_cs(cgrp);
1731 cpuset_filetype_t type = cft->private; 1832 cpuset_filetype_t type = cft->private;
1732 switch (type) { 1833 switch (type) {
1733 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1834 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1839,14 +1940,14 @@ static struct cftype files[] = {
1839 1940
1840/* 1941/*
1841 * cpuset_css_alloc - allocate a cpuset css 1942 * cpuset_css_alloc - allocate a cpuset css
1842 * cont: control group that the new cpuset will be part of 1943 * cgrp: control group that the new cpuset will be part of
1843 */ 1944 */
1844 1945
1845static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) 1946static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)
1846{ 1947{
1847 struct cpuset *cs; 1948 struct cpuset *cs;
1848 1949
1849 if (!cont->parent) 1950 if (!cgrp->parent)
1850 return &top_cpuset.css; 1951 return &top_cpuset.css;
1851 1952
1852 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1953 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
@@ -1861,7 +1962,6 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1861 cpumask_clear(cs->cpus_allowed); 1962 cpumask_clear(cs->cpus_allowed);
1862 nodes_clear(cs->mems_allowed); 1963 nodes_clear(cs->mems_allowed);
1863 fmeter_init(&cs->fmeter); 1964 fmeter_init(&cs->fmeter);
1864 INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
1865 cs->relax_domain_level = -1; 1965 cs->relax_domain_level = -1;
1866 1966
1867 return &cs->css; 1967 return &cs->css;
@@ -1942,9 +2042,9 @@ static void cpuset_css_offline(struct cgroup *cgrp)
1942 * will call rebuild_sched_domains_locked(). 2042 * will call rebuild_sched_domains_locked().
1943 */ 2043 */
1944 2044
1945static void cpuset_css_free(struct cgroup *cont) 2045static void cpuset_css_free(struct cgroup *cgrp)
1946{ 2046{
1947 struct cpuset *cs = cgroup_cs(cont); 2047 struct cpuset *cs = cgroup_cs(cgrp);
1948 2048
1949 free_cpumask_var(cs->cpus_allowed); 2049 free_cpumask_var(cs->cpus_allowed);
1950 kfree(cs); 2050 kfree(cs);
@@ -2024,41 +2124,64 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2024} 2124}
2025 2125
2026/** 2126/**
2027 * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset 2127 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
2028 * @cs: cpuset in interest 2128 * @cs: cpuset in interest
2029 * 2129 *
2030 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone 2130 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
2031 * offline, update @cs accordingly. If @cs ends up with no CPU or memory, 2131 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
2032 * all its tasks are moved to the nearest ancestor with both resources. 2132 * all its tasks are moved to the nearest ancestor with both resources.
2033 */ 2133 */
2034static void cpuset_propagate_hotplug_workfn(struct work_struct *work) 2134static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2035{ 2135{
2036 static cpumask_t off_cpus; 2136 static cpumask_t off_cpus;
2037 static nodemask_t off_mems, tmp_mems; 2137 static nodemask_t off_mems;
2038 struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
2039 bool is_empty; 2138 bool is_empty;
2139 bool sane = cgroup_sane_behavior(cs->css.cgroup);
2140
2141retry:
2142 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2040 2143
2041 mutex_lock(&cpuset_mutex); 2144 mutex_lock(&cpuset_mutex);
2042 2145
2146 /*
2147 * We have raced with task attaching. We wait until attaching
2148 * is finished, so we won't attach a task to an empty cpuset.
2149 */
2150 if (cs->attach_in_progress) {
2151 mutex_unlock(&cpuset_mutex);
2152 goto retry;
2153 }
2154
2043 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); 2155 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
2044 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); 2156 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
2045 2157
2046 /* remove offline cpus from @cs */ 2158 mutex_lock(&callback_mutex);
2047 if (!cpumask_empty(&off_cpus)) { 2159 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2048 mutex_lock(&callback_mutex); 2160 mutex_unlock(&callback_mutex);
2049 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); 2161
2050 mutex_unlock(&callback_mutex); 2162 /*
2163 * If sane_behavior flag is set, we need to update tasks' cpumask
2164 * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
2165 * call update_tasks_cpumask() if the cpuset becomes empty, as
2166 * the tasks in it will be migrated to an ancestor.
2167 */
2168 if ((sane && cpumask_empty(cs->cpus_allowed)) ||
2169 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
2051 update_tasks_cpumask(cs, NULL); 2170 update_tasks_cpumask(cs, NULL);
2052 }
2053 2171
2054 /* remove offline mems from @cs */ 2172 mutex_lock(&callback_mutex);
2055 if (!nodes_empty(off_mems)) { 2173 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
2056 tmp_mems = cs->mems_allowed; 2174 mutex_unlock(&callback_mutex);
2057 mutex_lock(&callback_mutex); 2175
2058 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); 2176 /*
2059 mutex_unlock(&callback_mutex); 2177 * If sane_behavior flag is set, we need to update tasks' nodemask
2060 update_tasks_nodemask(cs, &tmp_mems, NULL); 2178 * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
2061 } 2179 * call update_tasks_nodemask() if the cpuset becomes empty, as
 2180	 * the tasks in it will be migrated to an ancestor.
2181 */
2182 if ((sane && nodes_empty(cs->mems_allowed)) ||
2183 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
2184 update_tasks_nodemask(cs, NULL);
2062 2185
2063 is_empty = cpumask_empty(cs->cpus_allowed) || 2186 is_empty = cpumask_empty(cs->cpus_allowed) ||
2064 nodes_empty(cs->mems_allowed); 2187 nodes_empty(cs->mems_allowed);
@@ -2066,40 +2189,14 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
2066 mutex_unlock(&cpuset_mutex); 2189 mutex_unlock(&cpuset_mutex);
2067 2190
2068 /* 2191 /*
2069 * If @cs became empty, move tasks to the nearest ancestor with 2192 * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
2070 * execution resources. This is full cgroup operation which will 2193 *
2194 * Otherwise move tasks to the nearest ancestor with execution
 2195	 * resources. This is a full cgroup operation which will
2071 * also call back into cpuset. Should be done outside any lock. 2196 * also call back into cpuset. Should be done outside any lock.
2072 */ 2197 */
2073 if (is_empty) 2198 if (!sane && is_empty)
2074 remove_tasks_in_empty_cpuset(cs); 2199 remove_tasks_in_empty_cpuset(cs);
2075
2076 /* the following may free @cs, should be the last operation */
2077 css_put(&cs->css);
2078}
2079
2080/**
2081 * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
2082 * @cs: cpuset of interest
2083 *
2084 * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
2085 * memory masks according to top_cpuset.
2086 */
2087static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2088{
2089 /*
2090 * Pin @cs. The refcnt will be released when the work item
2091 * finishes executing.
2092 */
2093 if (!css_tryget(&cs->css))
2094 return;
2095
2096 /*
2097 * Queue @cs->hotplug_work. If already pending, lose the css ref.
2098 * cpuset_propagate_hotplug_wq is ordered and propagation will
2099 * happen in the order this function is called.
2100 */
2101 if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
2102 css_put(&cs->css);
2103} 2200}
2104 2201
2105/** 2202/**
@@ -2112,18 +2209,17 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2112 * actively using CPU hotplug but making no active use of cpusets. 2209 * actively using CPU hotplug but making no active use of cpusets.
2113 * 2210 *
2114 * Non-root cpusets are only affected by offlining. If any CPUs or memory 2211 * Non-root cpusets are only affected by offlining. If any CPUs or memory
2115 * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all 2212 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
2116 * descendants. 2213 * all descendants.
2117 * 2214 *
2118 * Note that CPU offlining during suspend is ignored. We don't modify 2215 * Note that CPU offlining during suspend is ignored. We don't modify
2119 * cpusets across suspend/resume cycles at all. 2216 * cpusets across suspend/resume cycles at all.
2120 */ 2217 */
2121static void cpuset_hotplug_workfn(struct work_struct *work) 2218static void cpuset_hotplug_workfn(struct work_struct *work)
2122{ 2219{
2123 static cpumask_t new_cpus, tmp_cpus; 2220 static cpumask_t new_cpus;
2124 static nodemask_t new_mems, tmp_mems; 2221 static nodemask_t new_mems;
2125 bool cpus_updated, mems_updated; 2222 bool cpus_updated, mems_updated;
2126 bool cpus_offlined, mems_offlined;
2127 2223
2128 mutex_lock(&cpuset_mutex); 2224 mutex_lock(&cpuset_mutex);
2129 2225
@@ -2132,12 +2228,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2132 new_mems = node_states[N_MEMORY]; 2228 new_mems = node_states[N_MEMORY];
2133 2229
2134 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); 2230 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
2135 cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
2136 &new_cpus);
2137
2138 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); 2231 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
2139 nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
2140 mems_offlined = !nodes_empty(tmp_mems);
2141 2232
2142 /* synchronize cpus_allowed to cpu_active_mask */ 2233 /* synchronize cpus_allowed to cpu_active_mask */
2143 if (cpus_updated) { 2234 if (cpus_updated) {
@@ -2149,28 +2240,32 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2149 2240
2150 /* synchronize mems_allowed to N_MEMORY */ 2241 /* synchronize mems_allowed to N_MEMORY */
2151 if (mems_updated) { 2242 if (mems_updated) {
2152 tmp_mems = top_cpuset.mems_allowed;
2153 mutex_lock(&callback_mutex); 2243 mutex_lock(&callback_mutex);
2154 top_cpuset.mems_allowed = new_mems; 2244 top_cpuset.mems_allowed = new_mems;
2155 mutex_unlock(&callback_mutex); 2245 mutex_unlock(&callback_mutex);
2156 update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); 2246 update_tasks_nodemask(&top_cpuset, NULL);
2157 } 2247 }
2158 2248
2159 /* if cpus or mems went down, we need to propagate to descendants */ 2249 mutex_unlock(&cpuset_mutex);
2160 if (cpus_offlined || mems_offlined) { 2250
2251 /* if cpus or mems changed, we need to propagate to descendants */
2252 if (cpus_updated || mems_updated) {
2161 struct cpuset *cs; 2253 struct cpuset *cs;
2162 struct cgroup *pos_cgrp; 2254 struct cgroup *pos_cgrp;
2163 2255
2164 rcu_read_lock(); 2256 rcu_read_lock();
2165 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) 2257 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) {
2166 schedule_cpuset_propagate_hotplug(cs); 2258 if (!css_tryget(&cs->css))
2167 rcu_read_unlock(); 2259 continue;
2168 } 2260 rcu_read_unlock();
2169 2261
2170 mutex_unlock(&cpuset_mutex); 2262 cpuset_hotplug_update_tasks(cs);
2171 2263
2172 /* wait for propagations to finish */ 2264 rcu_read_lock();
2173 flush_workqueue(cpuset_propagate_hotplug_wq); 2265 css_put(&cs->css);
2266 }
2267 rcu_read_unlock();
2268 }
2174 2269
2175 /* rebuild sched domains if cpus_allowed has changed */ 2270 /* rebuild sched domains if cpus_allowed has changed */
2176 if (cpus_updated) 2271 if (cpus_updated)
@@ -2219,12 +2314,9 @@ void __init cpuset_init_smp(void)
2219{ 2314{
2220 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2315 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2221 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2316 top_cpuset.mems_allowed = node_states[N_MEMORY];
2317 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
2222 2318
2223 register_hotmemory_notifier(&cpuset_track_online_nodes_nb); 2319 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2224
2225 cpuset_propagate_hotplug_wq =
2226 alloc_ordered_workqueue("cpuset_hotplug", 0);
2227 BUG_ON(!cpuset_propagate_hotplug_wq);
2228} 2320}
2229 2321
2230/** 2322/**
@@ -2240,21 +2332,23 @@ void __init cpuset_init_smp(void)
2240 2332
2241void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2333void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2242{ 2334{
2335 struct cpuset *cpus_cs;
2336
2243 mutex_lock(&callback_mutex); 2337 mutex_lock(&callback_mutex);
2244 task_lock(tsk); 2338 task_lock(tsk);
2245 guarantee_online_cpus(task_cs(tsk), pmask); 2339 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
2340 guarantee_online_cpus(cpus_cs, pmask);
2246 task_unlock(tsk); 2341 task_unlock(tsk);
2247 mutex_unlock(&callback_mutex); 2342 mutex_unlock(&callback_mutex);
2248} 2343}
2249 2344
2250void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2345void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2251{ 2346{
2252 const struct cpuset *cs; 2347 const struct cpuset *cpus_cs;
2253 2348
2254 rcu_read_lock(); 2349 rcu_read_lock();
2255 cs = task_cs(tsk); 2350 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
2256 if (cs) 2351 do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
2257 do_set_cpus_allowed(tsk, cs->cpus_allowed);
2258 rcu_read_unlock(); 2352 rcu_read_unlock();
2259 2353
2260 /* 2354 /*
@@ -2293,11 +2387,13 @@ void cpuset_init_current_mems_allowed(void)
2293 2387
2294nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 2388nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2295{ 2389{
2390 struct cpuset *mems_cs;
2296 nodemask_t mask; 2391 nodemask_t mask;
2297 2392
2298 mutex_lock(&callback_mutex); 2393 mutex_lock(&callback_mutex);
2299 task_lock(tsk); 2394 task_lock(tsk);
2300 guarantee_online_mems(task_cs(tsk), &mask); 2395 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
2396 guarantee_online_mems(mems_cs, &mask);
2301 task_unlock(tsk); 2397 task_unlock(tsk);
2302 mutex_unlock(&callback_mutex); 2398 mutex_unlock(&callback_mutex);
2303 2399
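
To make the decision logic in the new cpuset_hotplug_update_tasks() easier to follow, here is a minimal userspace sketch of the mask arithmetic only. It is not kernel code: cpumasks are modelled as 64-bit integers, and the locking, nodemask handling and attach retry loop are left out.

#include <stdbool.h>
#include <stdio.h>

/* cpumasks modelled as 64-bit bitmaps; locking and nodemasks omitted */
typedef unsigned long long mask_t;

static mask_t mask_andnot(mask_t a, mask_t b) { return a & ~b; }
static bool mask_empty(mask_t m) { return m == 0; }

int main(void)
{
	mask_t top_cpus = 0x3;   /* only CPUs 0-1 are still online */
	mask_t cs_cpus = 0xc;    /* this cpuset allowed CPUs 2-3 */
	bool sane = false;       /* legacy (non-sane_behavior) hierarchy */

	/* cpumask_andnot(): CPUs that went offline for this cpuset */
	mask_t off_cpus = mask_andnot(cs_cpus, top_cpus);

	/* drop them, as the kernel does under callback_mutex */
	cs_cpus = mask_andnot(cs_cpus, off_cpus);

	/*
	 * sane_behavior: update even an emptied cpuset so its tasks take on
	 * the ancestor's mask.  Legacy: only update while CPUs remain; an
	 * emptied cpuset instead has its tasks migrated to an ancestor.
	 */
	if ((sane && mask_empty(cs_cpus)) ||
	    (!mask_empty(off_cpus) && !mask_empty(cs_cpus)))
		printf("update_tasks_cpumask()\n");

	if (!sane && mask_empty(cs_cpus))
		printf("remove_tasks_in_empty_cpuset()\n");

	return 0;
}

With the values above the cpuset ends up empty, so the legacy path prints remove_tasks_in_empty_cpuset(); flipping sane to true takes the update_tasks_cpumask() branch instead.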
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1db3af933704..1833bc5a84a7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -182,7 +182,7 @@ void update_perf_cpu_limits(void)
182 u64 tmp = perf_sample_period_ns; 182 u64 tmp = perf_sample_period_ns;
183 183
184 tmp *= sysctl_perf_cpu_time_max_percent; 184 tmp *= sysctl_perf_cpu_time_max_percent;
185 tmp = do_div(tmp, 100); 185 do_div(tmp, 100);
186 atomic_set(&perf_sample_allowed_ns, tmp); 186 atomic_set(&perf_sample_allowed_ns, tmp);
187} 187}
188 188
@@ -232,7 +232,7 @@ DEFINE_PER_CPU(u64, running_sample_length);
232void perf_sample_event_took(u64 sample_len_ns) 232void perf_sample_event_took(u64 sample_len_ns)
233{ 233{
234 u64 avg_local_sample_len; 234 u64 avg_local_sample_len;
235 u64 local_samples_len = __get_cpu_var(running_sample_length); 235 u64 local_samples_len;
236 236
237 if (atomic_read(&perf_sample_allowed_ns) == 0) 237 if (atomic_read(&perf_sample_allowed_ns) == 0)
238 return; 238 return;
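
The one-line perf change above fixes a do_div() misuse: do_div() divides its first argument in place and returns the remainder, so assigning the return value back to tmp stored the remainder rather than the quotient. A hedged userspace illustration follows; the real do_div() is an architecture-specific macro that takes the dividend directly, emulated here with a pointer argument.

#include <stdio.h>

/*
 * Stand-in for the kernel's do_div(): divides *n in place and returns
 * the remainder, which is exactly why "tmp = do_div(tmp, 100)" was wrong.
 */
static unsigned int emulated_do_div(unsigned long long *n, unsigned int base)
{
	unsigned int rem = (unsigned int)(*n % base);

	*n /= base;
	return rem;
}

int main(void)
{
	unsigned long long tmp = 250000ULL;	/* e.g. period * percent */

	/* buggy pattern: keeps the remainder (0), loses the quotient */
	unsigned long long buggy = tmp;
	buggy = emulated_do_div(&buggy, 100);

	/* fixed pattern: the quotient is left in tmp itself */
	emulated_do_div(&tmp, 100);

	printf("buggy result = %llu, fixed result = %llu\n", buggy, tmp);
	return 0;
}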
diff --git a/kernel/exit.c b/kernel/exit.c
index 7bb73f9d09db..a949819055d5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -312,17 +312,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
312 } 312 }
313} 313}
314 314
315void __set_special_pids(struct pid *pid)
316{
317 struct task_struct *curr = current->group_leader;
318
319 if (task_session(curr) != pid)
320 change_pid(curr, PIDTYPE_SID, pid);
321
322 if (task_pgrp(curr) != pid)
323 change_pid(curr, PIDTYPE_PGID, pid);
324}
325
326/* 315/*
327 * Let kernel threads use this to say that they allow a certain signal. 316 * Let kernel threads use this to say that they allow a certain signal.
328 * Must not be used if kthread was cloned with CLONE_SIGHAND. 317 * Must not be used if kthread was cloned with CLONE_SIGHAND.
@@ -819,7 +808,7 @@ void do_exit(long code)
819 /* 808 /*
820 * FIXME: do that only when needed, using sched_exit tracepoint 809 * FIXME: do that only when needed, using sched_exit tracepoint
821 */ 810 */
822 ptrace_put_breakpoints(tsk); 811 flush_ptrace_hw_breakpoint(tsk);
823 812
824 exit_notify(tsk, group_dead); 813 exit_notify(tsk, group_dead);
825#ifdef CONFIG_NUMA 814#ifdef CONFIG_NUMA
@@ -835,7 +824,7 @@ void do_exit(long code)
835 /* 824 /*
836 * Make sure we are holding no locks: 825 * Make sure we are holding no locks:
837 */ 826 */
838 debug_check_no_locks_held(tsk); 827 debug_check_no_locks_held();
839 /* 828 /*
840 * We can do this unlocked here. The futex code uses this flag 829 * We can do this unlocked here. The futex code uses this flag
841 * just to verify whether the pi state cleanup has been done 830 * just to verify whether the pi state cleanup has been done
diff --git a/kernel/fork.c b/kernel/fork.c
index 987b28a1f01b..66635c80a813 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -365,8 +365,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
365 mm->locked_vm = 0; 365 mm->locked_vm = 0;
366 mm->mmap = NULL; 366 mm->mmap = NULL;
367 mm->mmap_cache = NULL; 367 mm->mmap_cache = NULL;
368 mm->free_area_cache = oldmm->mmap_base;
369 mm->cached_hole_size = ~0UL;
370 mm->map_count = 0; 368 mm->map_count = 0;
371 cpumask_clear(mm_cpumask(mm)); 369 cpumask_clear(mm_cpumask(mm));
372 mm->mm_rb = RB_ROOT; 370 mm->mm_rb = RB_ROOT;
@@ -540,8 +538,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
540 mm->nr_ptes = 0; 538 mm->nr_ptes = 0;
541 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); 539 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
542 spin_lock_init(&mm->page_table_lock); 540 spin_lock_init(&mm->page_table_lock);
543 mm->free_area_cache = TASK_UNMAPPED_BASE;
544 mm->cached_hole_size = ~0UL;
545 mm_init_aio(mm); 541 mm_init_aio(mm);
546 mm_init_owner(mm, p); 542 mm_init_owner(mm, p);
547 543
@@ -1121,6 +1117,12 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
1121 INIT_LIST_HEAD(&tsk->cpu_timers[2]); 1117 INIT_LIST_HEAD(&tsk->cpu_timers[2]);
1122} 1118}
1123 1119
1120static inline void
1121init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
1122{
1123 task->pids[type].pid = pid;
1124}
1125
1124/* 1126/*
1125 * This creates a new process as a copy of the old one, 1127 * This creates a new process as a copy of the old one,
1126 * but does not actually start it yet. 1128 * but does not actually start it yet.
@@ -1199,8 +1201,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1199 retval = -EAGAIN; 1201 retval = -EAGAIN;
1200 if (atomic_read(&p->real_cred->user->processes) >= 1202 if (atomic_read(&p->real_cred->user->processes) >=
1201 task_rlimit(p, RLIMIT_NPROC)) { 1203 task_rlimit(p, RLIMIT_NPROC)) {
1202 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1204 if (p->real_cred->user != INIT_USER &&
1203 p->real_cred->user != INIT_USER) 1205 !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
1204 goto bad_fork_free; 1206 goto bad_fork_free;
1205 } 1207 }
1206 current->flags &= ~PF_NPROC_EXCEEDED; 1208 current->flags &= ~PF_NPROC_EXCEEDED;
@@ -1354,11 +1356,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1354 goto bad_fork_cleanup_io; 1356 goto bad_fork_cleanup_io;
1355 } 1357 }
1356 1358
1357 p->pid = pid_nr(pid);
1358 p->tgid = p->pid;
1359 if (clone_flags & CLONE_THREAD)
1360 p->tgid = current->tgid;
1361
1362 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1359 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1363 /* 1360 /*
1364 * Clear TID on mm_release()? 1361 * Clear TID on mm_release()?
@@ -1394,12 +1391,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1394 clear_all_latency_tracing(p); 1391 clear_all_latency_tracing(p);
1395 1392
1396 /* ok, now we should be set up.. */ 1393 /* ok, now we should be set up.. */
1397 if (clone_flags & CLONE_THREAD) 1394 p->pid = pid_nr(pid);
1395 if (clone_flags & CLONE_THREAD) {
1398 p->exit_signal = -1; 1396 p->exit_signal = -1;
1399 else if (clone_flags & CLONE_PARENT) 1397 p->group_leader = current->group_leader;
1400 p->exit_signal = current->group_leader->exit_signal; 1398 p->tgid = current->tgid;
1401 else 1399 } else {
1402 p->exit_signal = (clone_flags & CSIGNAL); 1400 if (clone_flags & CLONE_PARENT)
1401 p->exit_signal = current->group_leader->exit_signal;
1402 else
1403 p->exit_signal = (clone_flags & CSIGNAL);
1404 p->group_leader = p;
1405 p->tgid = p->pid;
1406 }
1403 1407
1404 p->pdeath_signal = 0; 1408 p->pdeath_signal = 0;
1405 p->exit_state = 0; 1409 p->exit_state = 0;
@@ -1408,15 +1412,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1408 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); 1412 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1409 p->dirty_paused_when = 0; 1413 p->dirty_paused_when = 0;
1410 1414
1411 /*
1412 * Ok, make it visible to the rest of the system.
1413 * We dont wake it up yet.
1414 */
1415 p->group_leader = p;
1416 INIT_LIST_HEAD(&p->thread_group); 1415 INIT_LIST_HEAD(&p->thread_group);
1417 p->task_works = NULL; 1416 p->task_works = NULL;
1418 1417
1419 /* Need tasklist lock for parent etc handling! */ 1418 /*
 1419	 * Make it visible to the rest of the system, but don't wake it up yet.
1420 * Need tasklist lock for parent etc handling!
1421 */
1420 write_lock_irq(&tasklist_lock); 1422 write_lock_irq(&tasklist_lock);
1421 1423
1422 /* CLONE_PARENT re-uses the old parent */ 1424 /* CLONE_PARENT re-uses the old parent */
@@ -1446,18 +1448,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1446 goto bad_fork_free_pid; 1448 goto bad_fork_free_pid;
1447 } 1449 }
1448 1450
1449 if (clone_flags & CLONE_THREAD) {
1450 current->signal->nr_threads++;
1451 atomic_inc(&current->signal->live);
1452 atomic_inc(&current->signal->sigcnt);
1453 p->group_leader = current->group_leader;
1454 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1455 }
1456
1457 if (likely(p->pid)) { 1451 if (likely(p->pid)) {
1458 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1452 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1459 1453
1454 init_task_pid(p, PIDTYPE_PID, pid);
1460 if (thread_group_leader(p)) { 1455 if (thread_group_leader(p)) {
1456 init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
1457 init_task_pid(p, PIDTYPE_SID, task_session(current));
1458
1461 if (is_child_reaper(pid)) { 1459 if (is_child_reaper(pid)) {
1462 ns_of_pid(pid)->child_reaper = p; 1460 ns_of_pid(pid)->child_reaper = p;
1463 p->signal->flags |= SIGNAL_UNKILLABLE; 1461 p->signal->flags |= SIGNAL_UNKILLABLE;
@@ -1465,13 +1463,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1465 1463
1466 p->signal->leader_pid = pid; 1464 p->signal->leader_pid = pid;
1467 p->signal->tty = tty_kref_get(current->signal->tty); 1465 p->signal->tty = tty_kref_get(current->signal->tty);
1468 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1469 attach_pid(p, PIDTYPE_SID, task_session(current));
1470 list_add_tail(&p->sibling, &p->real_parent->children); 1466 list_add_tail(&p->sibling, &p->real_parent->children);
1471 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1467 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1468 attach_pid(p, PIDTYPE_PGID);
1469 attach_pid(p, PIDTYPE_SID);
1472 __this_cpu_inc(process_counts); 1470 __this_cpu_inc(process_counts);
1471 } else {
1472 current->signal->nr_threads++;
1473 atomic_inc(&current->signal->live);
1474 atomic_inc(&current->signal->sigcnt);
1475 list_add_tail_rcu(&p->thread_group,
1476 &p->group_leader->thread_group);
1473 } 1477 }
1474 attach_pid(p, PIDTYPE_PID, pid); 1478 attach_pid(p, PIDTYPE_PID);
1475 nr_threads++; 1479 nr_threads++;
1476 } 1480 }
1477 1481
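
The fork.c hunks above replace the old attach_pid(p, type, pid) calls with init_task_pid() followed by attach_pid(p, type), which reads the pid back out of task->pids[]. A minimal userspace sketch of that bookkeeping, using stand-in types rather than the real task_struct and struct pid definitions:

#include <stdio.h>

/* Illustrative stand-ins for the kernel's pid bookkeeping types. */
enum pid_type { PIDTYPE_PID, PIDTYPE_PGID, PIDTYPE_SID, PIDTYPE_MAX };

struct pid { int nr; };

struct pid_link { struct pid *pid; };

struct task { struct pid_link pids[PIDTYPE_MAX]; };

/* copy_process() now records the pid first ... */
static void init_task_pid(struct task *task, enum pid_type type, struct pid *pid)
{
	task->pids[type].pid = pid;
}

/* ... so attach_pid() can look it up instead of taking it as a parameter. */
static void attach_pid(struct task *task, enum pid_type type)
{
	struct pid *pid = task->pids[type].pid;

	printf("attach type %d -> pid %d\n", type, pid->nr);
}

int main(void)
{
	struct pid self = { .nr = 1234 }, pgrp = { .nr = 1000 };
	struct task t = { { { NULL }, { NULL }, { NULL } } };

	init_task_pid(&t, PIDTYPE_PID, &self);
	init_task_pid(&t, PIDTYPE_PGID, &pgrp);

	attach_pid(&t, PIDTYPE_PID);
	attach_pid(&t, PIDTYPE_PGID);
	return 0;
}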
diff --git a/kernel/freezer.c b/kernel/freezer.c
index c38893b0efba..8b2afc1c9df0 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -110,6 +110,18 @@ bool freeze_task(struct task_struct *p)
110{ 110{
111 unsigned long flags; 111 unsigned long flags;
112 112
113 /*
114 * This check can race with freezer_do_not_count, but worst case that
115 * will result in an extra wakeup being sent to the task. It does not
116 * race with freezer_count(), the barriers in freezer_count() and
117 * freezer_should_skip() ensure that either freezer_count() sees
118 * freezing == true in try_to_freeze() and freezes, or
119 * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task
120 * normally.
121 */
122 if (freezer_should_skip(p))
123 return false;
124
113 spin_lock_irqsave(&freezer_lock, flags); 125 spin_lock_irqsave(&freezer_lock, flags);
114 if (!freezing(p) || frozen(p)) { 126 if (!freezing(p) || frozen(p)) {
115 spin_unlock_irqrestore(&freezer_lock, flags); 127 spin_unlock_irqrestore(&freezer_lock, flags);
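
The freezer.c change adds a deliberately unlocked freezer_should_skip() test before freezer_lock is taken; the race is tolerated because its worst case is one redundant wakeup. A rough userspace sketch of that check-before-lock shape, with illustrative names and pthreads standing in for the kernel primitives:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct task {
	bool freeze_skip;	/* analogue of the freezer-skip flag */
	bool freezing;
};

static pthread_mutex_t freezer_lock = PTHREAD_MUTEX_INITIALIZER;

static bool freezer_should_skip(const struct task *p)
{
	return p->freeze_skip;	/* unlocked, intentionally racy */
}

static bool freeze_task(struct task *p)
{
	if (freezer_should_skip(p))
		return false;	/* skip the lock and the wakeup entirely */

	pthread_mutex_lock(&freezer_lock);
	if (!p->freezing) {
		pthread_mutex_unlock(&freezer_lock);
		return false;
	}
	printf("sending fake signal / wakeup\n");
	pthread_mutex_unlock(&freezer_lock);
	return true;
}

int main(void)
{
	struct task t = { .freeze_skip = false, .freezing = true };

	printf("froze: %d\n", freeze_task(&t));
	t.freeze_skip = true;
	printf("froze: %d\n", freeze_task(&t));
	return 0;
}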
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index fd4b13b131f8..f0f4fe29cd21 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -47,6 +47,7 @@
47#include <linux/sched/sysctl.h> 47#include <linux/sched/sysctl.h>
48#include <linux/sched/rt.h> 48#include <linux/sched/rt.h>
49#include <linux/timer.h> 49#include <linux/timer.h>
50#include <linux/freezer.h>
50 51
51#include <asm/uaccess.h> 52#include <asm/uaccess.h>
52 53
@@ -721,17 +722,20 @@ static int hrtimer_switch_to_hres(void)
721 return 1; 722 return 1;
722} 723}
723 724
725static void clock_was_set_work(struct work_struct *work)
726{
727 clock_was_set();
728}
729
730static DECLARE_WORK(hrtimer_work, clock_was_set_work);
731
724/* 732/*
 725 * Called from timekeeping code to reprogram the hrtimer interrupt 733 * Called from timekeeping and resume code to reprogram the hrtimer
726 * device. If called from the timer interrupt context we defer it to 734 * interrupt device on all cpus.
727 * softirq context.
728 */ 735 */
729void clock_was_set_delayed(void) 736void clock_was_set_delayed(void)
730{ 737{
731 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 738 schedule_work(&hrtimer_work);
732
733 cpu_base->clock_was_set = 1;
734 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
735} 739}
736 740
737#else 741#else
@@ -773,15 +777,19 @@ void clock_was_set(void)
773 777
774/* 778/*
775 * During resume we might have to reprogram the high resolution timer 779 * During resume we might have to reprogram the high resolution timer
776 * interrupt (on the local CPU): 780 * interrupt on all online CPUs. However, all other CPUs will be
 781 * stopped with interrupts disabled so the clock_was_set() call
782 * must be deferred.
777 */ 783 */
778void hrtimers_resume(void) 784void hrtimers_resume(void)
779{ 785{
780 WARN_ONCE(!irqs_disabled(), 786 WARN_ONCE(!irqs_disabled(),
781 KERN_INFO "hrtimers_resume() called with IRQs enabled!"); 787 KERN_INFO "hrtimers_resume() called with IRQs enabled!");
782 788
789 /* Retrigger on the local CPU */
783 retrigger_next_event(NULL); 790 retrigger_next_event(NULL);
784 timerfd_clock_was_set(); 791 /* And schedule a retrigger for all others */
792 clock_was_set_delayed();
785} 793}
786 794
787static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) 795static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
@@ -1432,13 +1440,6 @@ void hrtimer_peek_ahead_timers(void)
1432 1440
1433static void run_hrtimer_softirq(struct softirq_action *h) 1441static void run_hrtimer_softirq(struct softirq_action *h)
1434{ 1442{
1435 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1436
1437 if (cpu_base->clock_was_set) {
1438 cpu_base->clock_was_set = 0;
1439 clock_was_set();
1440 }
1441
1442 hrtimer_peek_ahead_timers(); 1443 hrtimer_peek_ahead_timers();
1443} 1444}
1444 1445
@@ -1545,7 +1546,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1545 t->task = NULL; 1546 t->task = NULL;
1546 1547
1547 if (likely(t->task)) 1548 if (likely(t->task))
1548 schedule(); 1549 freezable_schedule();
1549 1550
1550 hrtimer_cancel(&t->timer); 1551 hrtimer_cancel(&t->timer);
1551 mode = HRTIMER_MODE_ABS; 1552 mode = HRTIMER_MODE_ABS;
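
The hrtimer.c hunks drop the per-CPU clock_was_set softirq flag and simply queue a work item, so clock_was_set() runs later in process context. A loose userspace analogy of that deferral, using a pthread where the kernel uses DECLARE_WORK() and schedule_work():

#include <pthread.h>
#include <stdio.h>

static void clock_was_set(void)
{
	printf("retrigger hrtimer bases on all CPUs\n");
}

/* the queued work item: runs in a separate, sleepable context */
static void *clock_was_set_work(void *arg)
{
	(void)arg;
	clock_was_set();
	return NULL;
}

static pthread_t work_thread;

static void clock_was_set_delayed(void)
{
	/* schedule_work() analogue: hand the call off to another context */
	pthread_create(&work_thread, NULL, clock_was_set_work, NULL);
}

int main(void)
{
	/* e.g. called from the resume path where sleeping is not allowed */
	clock_was_set_delayed();
	pthread_join(work_thread, NULL);
	return 0;
}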
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 1c39eccc1eaf..10e663ab1f4a 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -135,7 +135,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
135} 135}
136 136
137/** 137/**
138 * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt 138 * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
139 * @d: irq_data 139 * @d: irq_data
140 */ 140 */
141void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) 141void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
@@ -275,10 +275,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
275 if (d->gc) 275 if (d->gc)
276 return -EBUSY; 276 return -EBUSY;
277 277
278 if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR) 278 numchips = d->revmap_size / irqs_per_chip;
279 return -EINVAL;
280
281 numchips = d->revmap_data.linear.size / irqs_per_chip;
282 if (!numchips) 279 if (!numchips)
283 return -EINVAL; 280 return -EINVAL;
284 281
@@ -310,6 +307,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
310 /* Calc pointer to the next generic chip */ 307 /* Calc pointer to the next generic chip */
311 tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); 308 tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
312 } 309 }
310 d->name = name;
313 return 0; 311 return 0;
314} 312}
315EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); 313EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
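
With the revmap union gone, irq_alloc_domain_generic_chips() derives the chip count straight from the domain's linear map size. A tiny sketch of that calculation; the field and function names here are stand-ins, not the real API.

#include <stdio.h>

struct irq_domain_stub { unsigned int revmap_size; };

static int alloc_generic_chips(struct irq_domain_stub *d, int irqs_per_chip)
{
	unsigned int numchips = d->revmap_size / irqs_per_chip;

	if (!numchips)
		return -1;	/* -EINVAL in the kernel */

	printf("%u chips of %d interrupts each\n", numchips, irqs_per_chip);
	return 0;
}

int main(void)
{
	struct irq_domain_stub d = { .revmap_size = 64 };

	alloc_generic_chips(&d, 32);	/* 2 chips  */
	alloc_generic_chips(&d, 128);	/* rejected */
	return 0;
}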
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 1ed8dff17eb9..2d7cd3428365 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -23,9 +23,11 @@ static DEFINE_MUTEX(revmap_trees_mutex);
23static struct irq_domain *irq_default_domain; 23static struct irq_domain *irq_default_domain;
24 24
25/** 25/**
26 * irq_domain_alloc() - Allocate a new irq_domain data structure 26 * __irq_domain_add() - Allocate a new irq_domain data structure
27 * @of_node: optional device-tree node of the interrupt controller 27 * @of_node: optional device-tree node of the interrupt controller
28 * @revmap_type: type of reverse mapping to use 28 * @size: Size of linear map; 0 for radix mapping only
29 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
30 * direct mapping
29 * @ops: map/unmap domain callbacks 31 * @ops: map/unmap domain callbacks
30 * @host_data: Controller private data pointer 32 * @host_data: Controller private data pointer
31 * 33 *
@@ -33,41 +35,35 @@ static struct irq_domain *irq_default_domain;
33 * register allocated irq_domain with irq_domain_register(). Returns pointer 35 * register allocated irq_domain with irq_domain_register(). Returns pointer
34 * to IRQ domain, or NULL on failure. 36 * to IRQ domain, or NULL on failure.
35 */ 37 */
36static struct irq_domain *irq_domain_alloc(struct device_node *of_node, 38struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
37 unsigned int revmap_type, 39 irq_hw_number_t hwirq_max, int direct_max,
38 const struct irq_domain_ops *ops, 40 const struct irq_domain_ops *ops,
39 void *host_data) 41 void *host_data)
40{ 42{
41 struct irq_domain *domain; 43 struct irq_domain *domain;
42 44
43 domain = kzalloc_node(sizeof(*domain), GFP_KERNEL, 45 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
44 of_node_to_nid(of_node)); 46 GFP_KERNEL, of_node_to_nid(of_node));
45 if (WARN_ON(!domain)) 47 if (WARN_ON(!domain))
46 return NULL; 48 return NULL;
47 49
48 /* Fill structure */ 50 /* Fill structure */
49 domain->revmap_type = revmap_type; 51 INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
50 domain->ops = ops; 52 domain->ops = ops;
51 domain->host_data = host_data; 53 domain->host_data = host_data;
52 domain->of_node = of_node_get(of_node); 54 domain->of_node = of_node_get(of_node);
55 domain->hwirq_max = hwirq_max;
56 domain->revmap_size = size;
57 domain->revmap_direct_max_irq = direct_max;
53 58
54 return domain;
55}
56
57static void irq_domain_free(struct irq_domain *domain)
58{
59 of_node_put(domain->of_node);
60 kfree(domain);
61}
62
63static void irq_domain_add(struct irq_domain *domain)
64{
65 mutex_lock(&irq_domain_mutex); 59 mutex_lock(&irq_domain_mutex);
66 list_add(&domain->link, &irq_domain_list); 60 list_add(&domain->link, &irq_domain_list);
67 mutex_unlock(&irq_domain_mutex); 61 mutex_unlock(&irq_domain_mutex);
68 pr_debug("Allocated domain of type %d @0x%p\n", 62
69 domain->revmap_type, domain); 63 pr_debug("Added domain %s\n", domain->name);
64 return domain;
70} 65}
66EXPORT_SYMBOL_GPL(__irq_domain_add);
71 67
72/** 68/**
73 * irq_domain_remove() - Remove an irq domain. 69 * irq_domain_remove() - Remove an irq domain.
@@ -81,29 +77,12 @@ void irq_domain_remove(struct irq_domain *domain)
81{ 77{
82 mutex_lock(&irq_domain_mutex); 78 mutex_lock(&irq_domain_mutex);
83 79
84 switch (domain->revmap_type) { 80 /*
85 case IRQ_DOMAIN_MAP_LEGACY: 81 * radix_tree_delete() takes care of destroying the root
86 /* 82 * node when all entries are removed. Shout if there are
87 * Legacy domains don't manage their own irq_desc 83 * any mappings left.
88 * allocations, we expect the caller to handle irq_desc 84 */
89 * freeing on their own. 85 WARN_ON(domain->revmap_tree.height);
90 */
91 break;
92 case IRQ_DOMAIN_MAP_TREE:
93 /*
94 * radix_tree_delete() takes care of destroying the root
95 * node when all entries are removed. Shout if there are
96 * any mappings left.
97 */
98 WARN_ON(domain->revmap_data.tree.height);
99 break;
100 case IRQ_DOMAIN_MAP_LINEAR:
101 kfree(domain->revmap_data.linear.revmap);
102 domain->revmap_data.linear.size = 0;
103 break;
104 case IRQ_DOMAIN_MAP_NOMAP:
105 break;
106 }
107 86
108 list_del(&domain->link); 87 list_del(&domain->link);
109 88
@@ -115,44 +94,30 @@ void irq_domain_remove(struct irq_domain *domain)
115 94
116 mutex_unlock(&irq_domain_mutex); 95 mutex_unlock(&irq_domain_mutex);
117 96
118 pr_debug("Removed domain of type %d @0x%p\n", 97 pr_debug("Removed domain %s\n", domain->name);
119 domain->revmap_type, domain);
120 98
121 irq_domain_free(domain); 99 of_node_put(domain->of_node);
100 kfree(domain);
122} 101}
123EXPORT_SYMBOL_GPL(irq_domain_remove); 102EXPORT_SYMBOL_GPL(irq_domain_remove);
124 103
125static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
126 irq_hw_number_t hwirq)
127{
128 irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq;
129 int size = domain->revmap_data.legacy.size;
130
131 if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size))
132 return 0;
133 return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq;
134}
135
136/** 104/**
137 * irq_domain_add_simple() - Allocate and register a simple irq_domain. 105 * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
138 * @of_node: pointer to interrupt controller's device tree node. 106 * @of_node: pointer to interrupt controller's device tree node.
139 * @size: total number of irqs in mapping 107 * @size: total number of irqs in mapping
140 * @first_irq: first number of irq block assigned to the domain, 108 * @first_irq: first number of irq block assigned to the domain,
141 * pass zero to assign irqs on-the-fly. This will result in a 109 * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
142 * linear IRQ domain so it is important to use irq_create_mapping() 110 * pre-map all of the irqs in the domain to virqs starting at first_irq.
143 * for each used IRQ, especially when SPARSE_IRQ is enabled.
144 * @ops: map/unmap domain callbacks 111 * @ops: map/unmap domain callbacks
145 * @host_data: Controller private data pointer 112 * @host_data: Controller private data pointer
146 * 113 *
147 * Allocates a legacy irq_domain if irq_base is positive or a linear 114 * Allocates an irq_domain, and optionally if first_irq is positive then also
148 * domain otherwise. For the legacy domain, IRQ descriptors will also 115 * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq.
149 * be allocated.
150 * 116 *
151 * This is intended to implement the expected behaviour for most 117 * This is intended to implement the expected behaviour for most
152 * interrupt controllers which is that a linear mapping should 118 * interrupt controllers. If device tree is used, then first_irq will be 0 and
153 * normally be used unless the system requires a legacy mapping in 119 * irqs get mapped dynamically on the fly. However, if the controller requires
154 * order to support supplying interrupt numbers during non-DT 120 * static virq assignments (non-DT boot) then it will set that up correctly.
155 * registration of devices.
156 */ 121 */
157struct irq_domain *irq_domain_add_simple(struct device_node *of_node, 122struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
158 unsigned int size, 123 unsigned int size,
@@ -160,33 +125,25 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
160 const struct irq_domain_ops *ops, 125 const struct irq_domain_ops *ops,
161 void *host_data) 126 void *host_data)
162{ 127{
163 if (first_irq > 0) { 128 struct irq_domain *domain;
164 int irq_base; 129
130 domain = __irq_domain_add(of_node, size, size, 0, ops, host_data);
131 if (!domain)
132 return NULL;
165 133
134 if (first_irq > 0) {
166 if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { 135 if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
 167 /* 136 /* attempt to allocate irq_descs */
168 * Set the descriptor allocator to search for a 137 int rc = irq_alloc_descs(first_irq, first_irq, size,
169 * 1-to-1 mapping, such as irq_alloc_desc_at(). 138 of_node_to_nid(of_node));
170 * Use of_node_to_nid() which is defined to 139 if (rc < 0)
171 * numa_node_id() on platforms that have no custom
172 * implementation.
173 */
174 irq_base = irq_alloc_descs(first_irq, first_irq, size,
175 of_node_to_nid(of_node));
176 if (irq_base < 0) {
177 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", 140 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
178 first_irq); 141 first_irq);
179 irq_base = first_irq; 142 }
180 } 143 irq_domain_associate_many(domain, first_irq, 0, size);
181 } else
182 irq_base = first_irq;
183
184 return irq_domain_add_legacy(of_node, size, irq_base, 0,
185 ops, host_data);
186 } 144 }
187 145
188 /* A linear domain is the default */ 146 return domain;
189 return irq_domain_add_linear(of_node, size, ops, host_data);
190} 147}
191EXPORT_SYMBOL_GPL(irq_domain_add_simple); 148EXPORT_SYMBOL_GPL(irq_domain_add_simple);
192 149
@@ -213,131 +170,19 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
213 void *host_data) 170 void *host_data)
214{ 171{
215 struct irq_domain *domain; 172 struct irq_domain *domain;
216 unsigned int i;
217 173
218 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); 174 domain = __irq_domain_add(of_node, first_hwirq + size,
175 first_hwirq + size, 0, ops, host_data);
219 if (!domain) 176 if (!domain)
220 return NULL; 177 return NULL;
221 178
222 domain->revmap_data.legacy.first_irq = first_irq; 179 irq_domain_associate_many(domain, first_irq, first_hwirq, size);
223 domain->revmap_data.legacy.first_hwirq = first_hwirq;
224 domain->revmap_data.legacy.size = size;
225 180
226 mutex_lock(&irq_domain_mutex);
227 /* Verify that all the irqs are available */
228 for (i = 0; i < size; i++) {
229 int irq = first_irq + i;
230 struct irq_data *irq_data = irq_get_irq_data(irq);
231
232 if (WARN_ON(!irq_data || irq_data->domain)) {
233 mutex_unlock(&irq_domain_mutex);
234 irq_domain_free(domain);
235 return NULL;
236 }
237 }
238
239 /* Claim all of the irqs before registering a legacy domain */
240 for (i = 0; i < size; i++) {
241 struct irq_data *irq_data = irq_get_irq_data(first_irq + i);
242 irq_data->hwirq = first_hwirq + i;
243 irq_data->domain = domain;
244 }
245 mutex_unlock(&irq_domain_mutex);
246
247 for (i = 0; i < size; i++) {
248 int irq = first_irq + i;
249 int hwirq = first_hwirq + i;
250
251 /* IRQ0 gets ignored */
252 if (!irq)
253 continue;
254
255 /* Legacy flags are left to default at this point,
256 * one can then use irq_create_mapping() to
257 * explicitly change them
258 */
259 if (ops->map)
260 ops->map(domain, irq, hwirq);
261
262 /* Clear norequest flags */
263 irq_clear_status_flags(irq, IRQ_NOREQUEST);
264 }
265
266 irq_domain_add(domain);
267 return domain; 181 return domain;
268} 182}
269EXPORT_SYMBOL_GPL(irq_domain_add_legacy); 183EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
270 184
271/** 185/**
272 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
273 * @of_node: pointer to interrupt controller's device tree node.
274 * @size: Number of interrupts in the domain.
275 * @ops: map/unmap domain callbacks
276 * @host_data: Controller private data pointer
277 */
278struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
279 unsigned int size,
280 const struct irq_domain_ops *ops,
281 void *host_data)
282{
283 struct irq_domain *domain;
284 unsigned int *revmap;
285
286 revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL,
287 of_node_to_nid(of_node));
288 if (WARN_ON(!revmap))
289 return NULL;
290
291 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data);
292 if (!domain) {
293 kfree(revmap);
294 return NULL;
295 }
296 domain->revmap_data.linear.size = size;
297 domain->revmap_data.linear.revmap = revmap;
298 irq_domain_add(domain);
299 return domain;
300}
301EXPORT_SYMBOL_GPL(irq_domain_add_linear);
302
303struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
304 unsigned int max_irq,
305 const struct irq_domain_ops *ops,
306 void *host_data)
307{
308 struct irq_domain *domain = irq_domain_alloc(of_node,
309 IRQ_DOMAIN_MAP_NOMAP, ops, host_data);
310 if (domain) {
311 domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0;
312 irq_domain_add(domain);
313 }
314 return domain;
315}
316EXPORT_SYMBOL_GPL(irq_domain_add_nomap);
317
318/**
319 * irq_domain_add_tree()
320 * @of_node: pointer to interrupt controller's device tree node.
321 * @ops: map/unmap domain callbacks
322 *
323 * Note: The radix tree will be allocated later during boot automatically
324 * (the reverse mapping will use the slow path until that happens).
325 */
326struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
327 const struct irq_domain_ops *ops,
328 void *host_data)
329{
330 struct irq_domain *domain = irq_domain_alloc(of_node,
331 IRQ_DOMAIN_MAP_TREE, ops, host_data);
332 if (domain) {
333 INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL);
334 irq_domain_add(domain);
335 }
336 return domain;
337}
338EXPORT_SYMBOL_GPL(irq_domain_add_tree);
339
340/**
341 * irq_find_host() - Locates a domain for a given device node 186 * irq_find_host() - Locates a domain for a given device node
342 * @node: device-tree node of the interrupt controller 187 * @node: device-tree node of the interrupt controller
343 */ 188 */
@@ -385,125 +230,108 @@ void irq_set_default_host(struct irq_domain *domain)
385} 230}
386EXPORT_SYMBOL_GPL(irq_set_default_host); 231EXPORT_SYMBOL_GPL(irq_set_default_host);
387 232
388static void irq_domain_disassociate_many(struct irq_domain *domain, 233static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
389 unsigned int irq_base, int count)
390{ 234{
391 /* 235 struct irq_data *irq_data = irq_get_irq_data(irq);
392 * disassociate in reverse order; 236 irq_hw_number_t hwirq;
393 * not strictly necessary, but nice for unwinding
394 */
395 while (count--) {
396 int irq = irq_base + count;
397 struct irq_data *irq_data = irq_get_irq_data(irq);
398 irq_hw_number_t hwirq;
399 237
400 if (WARN_ON(!irq_data || irq_data->domain != domain)) 238 if (WARN(!irq_data || irq_data->domain != domain,
401 continue; 239 "virq%i doesn't exist; cannot disassociate\n", irq))
240 return;
402 241
403 hwirq = irq_data->hwirq; 242 hwirq = irq_data->hwirq;
404 irq_set_status_flags(irq, IRQ_NOREQUEST); 243 irq_set_status_flags(irq, IRQ_NOREQUEST);
405 244
406 /* remove chip and handler */ 245 /* remove chip and handler */
407 irq_set_chip_and_handler(irq, NULL, NULL); 246 irq_set_chip_and_handler(irq, NULL, NULL);
408 247
409 /* Make sure it's completed */ 248 /* Make sure it's completed */
410 synchronize_irq(irq); 249 synchronize_irq(irq);
411 250
412 /* Tell the PIC about it */ 251 /* Tell the PIC about it */
413 if (domain->ops->unmap) 252 if (domain->ops->unmap)
414 domain->ops->unmap(domain, irq); 253 domain->ops->unmap(domain, irq);
415 smp_mb(); 254 smp_mb();
416 255
417 irq_data->domain = NULL; 256 irq_data->domain = NULL;
418 irq_data->hwirq = 0; 257 irq_data->hwirq = 0;
419 258
420 /* Clear reverse map */ 259 /* Clear reverse map for this hwirq */
421 switch(domain->revmap_type) { 260 if (hwirq < domain->revmap_size) {
422 case IRQ_DOMAIN_MAP_LINEAR: 261 domain->linear_revmap[hwirq] = 0;
423 if (hwirq < domain->revmap_data.linear.size) 262 } else {
424 domain->revmap_data.linear.revmap[hwirq] = 0; 263 mutex_lock(&revmap_trees_mutex);
425 break; 264 radix_tree_delete(&domain->revmap_tree, hwirq);
426 case IRQ_DOMAIN_MAP_TREE: 265 mutex_unlock(&revmap_trees_mutex);
427 mutex_lock(&revmap_trees_mutex);
428 radix_tree_delete(&domain->revmap_data.tree, hwirq);
429 mutex_unlock(&revmap_trees_mutex);
430 break;
431 }
432 } 266 }
433} 267}
434 268
435int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, 269int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
436 irq_hw_number_t hwirq_base, int count) 270 irq_hw_number_t hwirq)
437{ 271{
438 unsigned int virq = irq_base; 272 struct irq_data *irq_data = irq_get_irq_data(virq);
439 irq_hw_number_t hwirq = hwirq_base; 273 int ret;
440 int i, ret;
441 274
442 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, 275 if (WARN(hwirq >= domain->hwirq_max,
443 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); 276 "error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name))
277 return -EINVAL;
278 if (WARN(!irq_data, "error: virq%i is not allocated", virq))
279 return -EINVAL;
280 if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
281 return -EINVAL;
444 282
445 for (i = 0; i < count; i++) { 283 mutex_lock(&irq_domain_mutex);
446 struct irq_data *irq_data = irq_get_irq_data(virq + i); 284 irq_data->hwirq = hwirq;
447 285 irq_data->domain = domain;
448 if (WARN(!irq_data, "error: irq_desc not allocated; " 286 if (domain->ops->map) {
449 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) 287 ret = domain->ops->map(domain, virq, hwirq);
450 return -EINVAL; 288 if (ret != 0) {
451 if (WARN(irq_data->domain, "error: irq_desc already associated; " 289 /*
452 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) 290 * If map() returns -EPERM, this interrupt is protected
453 return -EINVAL; 291 * by the firmware or some other service and shall not
454 }; 292 * be mapped. Don't bother telling the user about it.
455 293 */
456 for (i = 0; i < count; i++, virq++, hwirq++) { 294 if (ret != -EPERM) {
457 struct irq_data *irq_data = irq_get_irq_data(virq); 295 pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n",
458 296 domain->name, hwirq, virq, ret);
459 irq_data->hwirq = hwirq;
460 irq_data->domain = domain;
461 if (domain->ops->map) {
462 ret = domain->ops->map(domain, virq, hwirq);
463 if (ret != 0) {
464 /*
465 * If map() returns -EPERM, this interrupt is protected
466 * by the firmware or some other service and shall not
467 * be mapped.
468 *
469 * Since on some platforms we blindly try to map everything
470 * we end up with a log full of backtraces.
471 *
472 * So instead, we silently fail on -EPERM, it is the
473 * responsibility of the PIC driver to display a relevant
474 * message if needed.
475 */
476 if (ret != -EPERM) {
477 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
478 virq, hwirq, ret);
479 WARN_ON(1);
480 }
481 irq_data->domain = NULL;
482 irq_data->hwirq = 0;
483 goto err_unmap;
484 } 297 }
298 irq_data->domain = NULL;
299 irq_data->hwirq = 0;
300 mutex_unlock(&irq_domain_mutex);
301 return ret;
485 } 302 }
486 303
487 switch (domain->revmap_type) { 304 /* If not already assigned, give the domain the chip's name */
488 case IRQ_DOMAIN_MAP_LINEAR: 305 if (!domain->name && irq_data->chip)
489 if (hwirq < domain->revmap_data.linear.size) 306 domain->name = irq_data->chip->name;
490 domain->revmap_data.linear.revmap[hwirq] = virq; 307 }
491 break;
492 case IRQ_DOMAIN_MAP_TREE:
493 mutex_lock(&revmap_trees_mutex);
494 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
495 mutex_unlock(&revmap_trees_mutex);
496 break;
497 }
498 308
499 irq_clear_status_flags(virq, IRQ_NOREQUEST); 309 if (hwirq < domain->revmap_size) {
310 domain->linear_revmap[hwirq] = virq;
311 } else {
312 mutex_lock(&revmap_trees_mutex);
313 radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
314 mutex_unlock(&revmap_trees_mutex);
500 } 315 }
316 mutex_unlock(&irq_domain_mutex);
317
318 irq_clear_status_flags(virq, IRQ_NOREQUEST);
501 319
502 return 0; 320 return 0;
321}
322EXPORT_SYMBOL_GPL(irq_domain_associate);
503 323
504 err_unmap: 324void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
505 irq_domain_disassociate_many(domain, irq_base, i); 325 irq_hw_number_t hwirq_base, int count)
506 return -EINVAL; 326{
327 int i;
328
329 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
330 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
331
332 for (i = 0; i < count; i++) {
333 irq_domain_associate(domain, irq_base + i, hwirq_base + i);
334 }
507} 335}
508EXPORT_SYMBOL_GPL(irq_domain_associate_many); 336EXPORT_SYMBOL_GPL(irq_domain_associate_many);
509 337
@@ -513,7 +341,9 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many);
513 * 341 *
514 * This routine is used for irq controllers which can choose the hardware 342 * This routine is used for irq controllers which can choose the hardware
515 * interrupt numbers they generate. In such a case it's simplest to use 343 * interrupt numbers they generate. In such a case it's simplest to use
516 * the linux irq as the hardware interrupt number. 344 * the linux irq as the hardware interrupt number. It still uses the linear
345 * or radix tree to store the mapping, but the irq controller can optimize
346 * the revmap path by using the hwirq directly.
517 */ 347 */
518unsigned int irq_create_direct_mapping(struct irq_domain *domain) 348unsigned int irq_create_direct_mapping(struct irq_domain *domain)
519{ 349{
@@ -522,17 +352,14 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
522 if (domain == NULL) 352 if (domain == NULL)
523 domain = irq_default_domain; 353 domain = irq_default_domain;
524 354
525 if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP))
526 return 0;
527
528 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); 355 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
529 if (!virq) { 356 if (!virq) {
530 pr_debug("create_direct virq allocation failed\n"); 357 pr_debug("create_direct virq allocation failed\n");
531 return 0; 358 return 0;
532 } 359 }
533 if (virq >= domain->revmap_data.nomap.max_irq) { 360 if (virq >= domain->revmap_direct_max_irq) {
534 pr_err("ERROR: no free irqs available below %i maximum\n", 361 pr_err("ERROR: no free irqs available below %i maximum\n",
535 domain->revmap_data.nomap.max_irq); 362 domain->revmap_direct_max_irq);
536 irq_free_desc(virq); 363 irq_free_desc(virq);
537 return 0; 364 return 0;
538 } 365 }
@@ -569,9 +396,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
569 if (domain == NULL) 396 if (domain == NULL)
570 domain = irq_default_domain; 397 domain = irq_default_domain;
571 if (domain == NULL) { 398 if (domain == NULL) {
572 pr_warning("irq_create_mapping called for" 399 WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
573 " NULL domain, hwirq=%lx\n", hwirq);
574 WARN_ON(1);
575 return 0; 400 return 0;
576 } 401 }
577 pr_debug("-> using domain @%p\n", domain); 402 pr_debug("-> using domain @%p\n", domain);
@@ -583,10 +408,6 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
583 return virq; 408 return virq;
584 } 409 }
585 410
586 /* Get a virtual interrupt number */
587 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
588 return irq_domain_legacy_revmap(domain, hwirq);
589
590 /* Allocate a virtual interrupt number */ 411 /* Allocate a virtual interrupt number */
591 hint = hwirq % nr_irqs; 412 hint = hwirq % nr_irqs;
592 if (hint == 0) 413 if (hint == 0)
@@ -639,12 +460,7 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
639 if (unlikely(ret < 0)) 460 if (unlikely(ret < 0))
640 return ret; 461 return ret;
641 462
642 ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count); 463 irq_domain_associate_many(domain, irq_base, hwirq_base, count);
643 if (unlikely(ret < 0)) {
644 irq_free_descs(irq_base, count);
645 return ret;
646 }
647
648 return 0; 464 return 0;
649} 465}
650EXPORT_SYMBOL_GPL(irq_create_strict_mappings); 466EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
@@ -671,8 +487,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
671 if (intsize > 0) 487 if (intsize > 0)
672 return intspec[0]; 488 return intspec[0];
673#endif 489#endif
674 pr_warning("no irq domain found for %s !\n", 490 pr_warn("no irq domain found for %s !\n",
675 of_node_full_name(controller)); 491 of_node_full_name(controller));
676 return 0; 492 return 0;
677 } 493 }
678 494
@@ -714,11 +530,7 @@ void irq_dispose_mapping(unsigned int virq)
714 if (WARN_ON(domain == NULL)) 530 if (WARN_ON(domain == NULL))
715 return; 531 return;
716 532
717 /* Never unmap legacy interrupts */ 533 irq_domain_disassociate(domain, virq);
718 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
719 return;
720
721 irq_domain_disassociate_many(domain, virq, 1);
722 irq_free_desc(virq); 534 irq_free_desc(virq);
723} 535}
724EXPORT_SYMBOL_GPL(irq_dispose_mapping); 536EXPORT_SYMBOL_GPL(irq_dispose_mapping);
@@ -739,63 +551,51 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
739 if (domain == NULL) 551 if (domain == NULL)
740 return 0; 552 return 0;
741 553
742 switch (domain->revmap_type) { 554 if (hwirq < domain->revmap_direct_max_irq) {
743 case IRQ_DOMAIN_MAP_LEGACY:
744 return irq_domain_legacy_revmap(domain, hwirq);
745 case IRQ_DOMAIN_MAP_LINEAR:
746 return irq_linear_revmap(domain, hwirq);
747 case IRQ_DOMAIN_MAP_TREE:
748 rcu_read_lock();
749 data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
750 rcu_read_unlock();
751 if (data)
752 return data->irq;
753 break;
754 case IRQ_DOMAIN_MAP_NOMAP:
755 data = irq_get_irq_data(hwirq); 555 data = irq_get_irq_data(hwirq);
756 if (data && (data->domain == domain) && (data->hwirq == hwirq)) 556 if (data && (data->domain == domain) && (data->hwirq == hwirq))
757 return hwirq; 557 return hwirq;
758 break;
759 } 558 }
760 559
761 return 0; 560 /* Check if the hwirq is in the linear revmap. */
762} 561 if (hwirq < domain->revmap_size)
763EXPORT_SYMBOL_GPL(irq_find_mapping); 562 return domain->linear_revmap[hwirq];
764 563
765/** 564 rcu_read_lock();
766 * irq_linear_revmap() - Find a linux irq from a hw irq number. 565 data = radix_tree_lookup(&domain->revmap_tree, hwirq);
767 * @domain: domain owning this hardware interrupt 566 rcu_read_unlock();
768 * @hwirq: hardware irq number in that domain space 567 return data ? data->irq : 0;
769 *
770 * This is a fast path that can be called directly by irq controller code to
771 * save a handful of instructions.
772 */
773unsigned int irq_linear_revmap(struct irq_domain *domain,
774 irq_hw_number_t hwirq)
775{
776 BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR);
777
778 /* Check revmap bounds; complain if exceeded */
779 if (WARN_ON(hwirq >= domain->revmap_data.linear.size))
780 return 0;
781
782 return domain->revmap_data.linear.revmap[hwirq];
783} 568}
784EXPORT_SYMBOL_GPL(irq_linear_revmap); 569EXPORT_SYMBOL_GPL(irq_find_mapping);
785 570
786#ifdef CONFIG_IRQ_DOMAIN_DEBUG 571#ifdef CONFIG_IRQ_DOMAIN_DEBUG
787static int virq_debug_show(struct seq_file *m, void *private) 572static int virq_debug_show(struct seq_file *m, void *private)
788{ 573{
789 unsigned long flags; 574 unsigned long flags;
790 struct irq_desc *desc; 575 struct irq_desc *desc;
791 const char *p; 576 struct irq_domain *domain;
792 static const char none[] = "none"; 577 struct radix_tree_iter iter;
793 void *data; 578 void *data, **slot;
794 int i; 579 int i;
795 580
796 seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", 581 seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
582 "name", "mapped", "linear-max", "direct-max", "devtree-node");
583 mutex_lock(&irq_domain_mutex);
584 list_for_each_entry(domain, &irq_domain_list, link) {
585 int count = 0;
586 radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
587 count++;
588 seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
589 domain == irq_default_domain ? '*' : ' ', domain->name,
590 domain->revmap_size + count, domain->revmap_size,
591 domain->revmap_direct_max_irq,
592 domain->of_node ? of_node_full_name(domain->of_node) : "");
593 }
594 mutex_unlock(&irq_domain_mutex);
595
596 seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq",
797 "chip name", (int)(2 * sizeof(void *) + 2), "chip data", 597 "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
798 "domain name"); 598 "active", "type", "domain");
799 599
800 for (i = 1; i < nr_irqs; i++) { 600 for (i = 1; i < nr_irqs; i++) {
801 desc = irq_to_desc(i); 601 desc = irq_to_desc(i);
@@ -803,28 +603,28 @@ static int virq_debug_show(struct seq_file *m, void *private)
803 continue; 603 continue;
804 604
805 raw_spin_lock_irqsave(&desc->lock, flags); 605 raw_spin_lock_irqsave(&desc->lock, flags);
606 domain = desc->irq_data.domain;
806 607
807 if (desc->action && desc->action->handler) { 608 if (domain) {
808 struct irq_chip *chip; 609 struct irq_chip *chip;
610 int hwirq = desc->irq_data.hwirq;
611 bool direct;
809 612
810 seq_printf(m, "%5d ", i); 613 seq_printf(m, "%5d ", i);
811 seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); 614 seq_printf(m, "0x%05x ", hwirq);
812 615
813 chip = irq_desc_get_chip(desc); 616 chip = irq_desc_get_chip(desc);
814 if (chip && chip->name) 617 seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
815 p = chip->name;
816 else
817 p = none;
818 seq_printf(m, "%-15s ", p);
819 618
820 data = irq_desc_get_chip_data(desc); 619 data = irq_desc_get_chip_data(desc);
821 seq_printf(m, data ? "0x%p " : " %p ", data); 620 seq_printf(m, data ? "0x%p " : " %p ", data);
822 621
823 if (desc->irq_data.domain) 622 seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
824 p = of_node_full_name(desc->irq_data.domain->of_node); 623 direct = (i == hwirq) && (i < domain->revmap_direct_max_irq);
825 else 624 seq_printf(m, "%6s%-8s ",
826 p = none; 625 (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
827 seq_printf(m, "%s\n", p); 626 direct ? "(DIRECT)" : "");
627 seq_printf(m, "%s\n", desc->irq_data.domain->name);
828 } 628 }
829 629
830 raw_spin_unlock_irqrestore(&desc->lock, flags); 630 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -921,18 +721,3 @@ const struct irq_domain_ops irq_domain_simple_ops = {
921 .xlate = irq_domain_xlate_onetwocell, 721 .xlate = irq_domain_xlate_onetwocell,
922}; 722};
923EXPORT_SYMBOL_GPL(irq_domain_simple_ops); 723EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
924
925#ifdef CONFIG_OF_IRQ
926void irq_domain_generate_simple(const struct of_device_id *match,
927 u64 phys_base, unsigned int irq_start)
928{
929 struct device_node *node;
930 pr_debug("looking for phys_base=%llx, irq_start=%i\n",
931 (unsigned long long) phys_base, (int) irq_start);
932 node = of_find_matching_node_by_address(NULL, match, phys_base);
933 if (node)
934 irq_domain_add_legacy(node, 32, irq_start, 0,
935 &irq_domain_simple_ops, NULL);
936}
937EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
938#endif
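
The irqdomain rework above collapses the legacy/linear/nomap/tree variants into a single domain type: hwirqs below revmap_size use a linear array, anything larger falls back to the radix tree. A simplified userspace sketch of that combined reverse map; a linked list stands in for the radix tree, and the direct-map case is omitted.

#include <stdio.h>
#include <stdlib.h>

struct revmap_entry {
	unsigned long hwirq;
	unsigned int virq;
	struct revmap_entry *next;
};

struct domain {
	unsigned int revmap_size;
	unsigned int *linear_revmap;		/* [0, revmap_size) */
	struct revmap_entry *sparse;		/* radix-tree stand-in */
};

static void associate(struct domain *d, unsigned int virq, unsigned long hwirq)
{
	if (hwirq < d->revmap_size) {
		d->linear_revmap[hwirq] = virq;
	} else {
		struct revmap_entry *e = malloc(sizeof(*e));

		if (!e)
			return;
		e->hwirq = hwirq;
		e->virq = virq;
		e->next = d->sparse;
		d->sparse = e;
	}
}

static unsigned int find_mapping(const struct domain *d, unsigned long hwirq)
{
	const struct revmap_entry *e;

	if (hwirq < d->revmap_size)		/* fast path */
		return d->linear_revmap[hwirq];

	for (e = d->sparse; e; e = e->next)	/* slow path */
		if (e->hwirq == hwirq)
			return e->virq;
	return 0;
}

int main(void)
{
	unsigned int linear[16] = { 0 };
	struct domain d = { .revmap_size = 16, .linear_revmap = linear };

	associate(&d, 34, 5);		/* small hwirq -> array slot */
	associate(&d, 35, 0x2000);	/* large hwirq -> sparse map */

	printf("hwirq 5      -> virq %u\n", find_mapping(&d, 5));
	printf("hwirq 0x2000 -> virq %u\n", find_mapping(&d, 0x2000));
	printf("hwirq 7      -> virq %u\n", find_mapping(&d, 7));

	while (d.sparse) {			/* tidy up the sparse entries */
		struct revmap_entry *e = d.sparse;
		d.sparse = e->next;
		free(e);
	}
	return 0;
}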
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 19ed5c425c3b..36f6ee181b0c 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -462,6 +462,8 @@ int show_interrupts(struct seq_file *p, void *v)
462 } else { 462 } else {
463 seq_printf(p, " %8s", "None"); 463 seq_printf(p, " %8s", "None");
464 } 464 }
465 if (desc->irq_data.domain)
466 seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
465#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL 467#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
466 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); 468 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
467#endif 469#endif
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8241906c4b61..fb326365b694 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -147,6 +147,9 @@ int __request_module(bool wait, const char *fmt, ...)
147 */ 147 */
148 WARN_ON_ONCE(wait && current_is_async()); 148 WARN_ON_ONCE(wait && current_is_async());
149 149
150 if (!modprobe_path[0])
151 return 0;
152
150 va_start(args, fmt); 153 va_start(args, fmt);
151 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 154 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
152 va_end(args); 155 va_end(args);
@@ -569,14 +572,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
569 int retval = 0; 572 int retval = 0;
570 573
571 helper_lock(); 574 helper_lock();
572 if (!sub_info->path) {
573 retval = -EINVAL;
574 goto out;
575 }
576
577 if (sub_info->path[0] == '\0')
578 goto out;
579
580 if (!khelper_wq || usermodehelper_disabled) { 575 if (!khelper_wq || usermodehelper_disabled) {
581 retval = -EBUSY; 576 retval = -EBUSY;
582 goto out; 577 goto out;
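
Taken together, the two kmod.c hunks above move the "no helper configured" policy into __request_module() itself: if modprobe_path is empty (it can be cleared through the kernel.modprobe sysctl), the request becomes a quiet no-op instead of an error surfacing from call_usermodehelper_exec(). A small userspace sketch of that policy, with purely illustrative names:

/* Sketch only: an empty helper path means "disabled", reported as success. */
#include <stdio.h>
#include <string.h>

static char modprobe_path[256] = "";    /* cleared: auto-loading disabled */

static int request_module_sketch(const char *name)
{
	if (!modprobe_path[0])
		return 0;               /* disabled: succeed without doing anything */
	printf("would exec %s %s\n", modprobe_path, name);
	return 0;
}

int main(void)
{
	request_module_sketch("dummy");             /* silent no-op */
	strcpy(modprobe_path, "/sbin/modprobe");
	request_module_sketch("dummy");             /* would invoke the helper */
	return 0;
}
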
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index bddf3b201a48..6e33498d665c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2332,6 +2332,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
2332 if (copy_from_user(buf, user_buf, buf_size)) 2332 if (copy_from_user(buf, user_buf, buf_size))
2333 return -EFAULT; 2333 return -EFAULT;
2334 2334
2335 buf[buf_size] = '\0';
2335 switch (buf[0]) { 2336 switch (buf[0]) {
2336 case 'y': 2337 case 'y':
2337 case 'Y': 2338 case 'Y':
@@ -2343,6 +2344,8 @@ static ssize_t write_enabled_file_bool(struct file *file,
2343 case '0': 2344 case '0':
2344 disarm_all_kprobes(); 2345 disarm_all_kprobes();
2345 break; 2346 break;
2347 default:
2348 return -EINVAL;
2346 } 2349 }
2347 2350
2348 return count; 2351 return count;
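
The kprobes hunks above harden the debugfs "enabled" write handler in two small ways: the buffer copied from userspace is NUL-terminated before it is inspected, and input other than y/Y/1 or n/N/0 now fails with -EINVAL instead of being silently accepted. A standalone sketch of the resulting parse (memcpy stands in for copy_from_user(), and the arm/disarm calls are reduced to return values):

#include <errno.h>
#include <stdio.h>
#include <string.h>

static int parse_enable(const char *user_buf, size_t count)
{
	char buf[32];
	size_t buf_size = count < sizeof(buf) - 1 ? count : sizeof(buf) - 1;

	memcpy(buf, user_buf, buf_size);   /* copy_from_user() in the kernel */
	buf[buf_size] = '\0';              /* first fix: always terminate */

	switch (buf[0]) {
	case 'y': case 'Y': case '1':
		return 1;                  /* would arm_all_kprobes() */
	case 'n': case 'N': case '0':
		return 0;                  /* would disarm_all_kprobes() */
	default:
		return -EINVAL;            /* second fix: reject junk */
	}
}

int main(void)
{
	printf("%d %d %d\n", parse_enable("1\n", 2),
	       parse_enable("n", 1), parse_enable("x", 1));
	return 0;
}
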
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1f3186b37fd5..e16c45b9ee77 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -4090,7 +4090,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
4090} 4090}
4091EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); 4091EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
4092 4092
4093static void print_held_locks_bug(struct task_struct *curr) 4093static void print_held_locks_bug(void)
4094{ 4094{
4095 if (!debug_locks_off()) 4095 if (!debug_locks_off())
4096 return; 4096 return;
@@ -4099,22 +4099,21 @@ static void print_held_locks_bug(struct task_struct *curr)
4099 4099
4100 printk("\n"); 4100 printk("\n");
4101 printk("=====================================\n"); 4101 printk("=====================================\n");
4102 printk("[ BUG: lock held at task exit time! ]\n"); 4102 printk("[ BUG: %s/%d still has locks held! ]\n",
4103 current->comm, task_pid_nr(current));
4103 print_kernel_ident(); 4104 print_kernel_ident();
4104 printk("-------------------------------------\n"); 4105 printk("-------------------------------------\n");
4105 printk("%s/%d is exiting with locks still held!\n", 4106 lockdep_print_held_locks(current);
4106 curr->comm, task_pid_nr(curr));
4107 lockdep_print_held_locks(curr);
4108
4109 printk("\nstack backtrace:\n"); 4107 printk("\nstack backtrace:\n");
4110 dump_stack(); 4108 dump_stack();
4111} 4109}
4112 4110
4113void debug_check_no_locks_held(struct task_struct *task) 4111void debug_check_no_locks_held(void)
4114{ 4112{
4115 if (unlikely(task->lockdep_depth > 0)) 4113 if (unlikely(current->lockdep_depth > 0))
4116 print_held_locks_bug(task); 4114 print_held_locks_bug();
4117} 4115}
4116EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
4118 4117
4119void debug_show_all_locks(void) 4118void debug_show_all_locks(void)
4120{ 4119{
diff --git a/kernel/module.c b/kernel/module.c
index cab4bce49c23..206915830d29 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -455,7 +455,7 @@ const struct kernel_symbol *find_symbol(const char *name,
455EXPORT_SYMBOL_GPL(find_symbol); 455EXPORT_SYMBOL_GPL(find_symbol);
456 456
457/* Search for module by name: must hold module_mutex. */ 457/* Search for module by name: must hold module_mutex. */
458static struct module *find_module_all(const char *name, 458static struct module *find_module_all(const char *name, size_t len,
459 bool even_unformed) 459 bool even_unformed)
460{ 460{
461 struct module *mod; 461 struct module *mod;
@@ -463,7 +463,7 @@ static struct module *find_module_all(const char *name,
463 list_for_each_entry(mod, &modules, list) { 463 list_for_each_entry(mod, &modules, list) {
464 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) 464 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
465 continue; 465 continue;
466 if (strcmp(mod->name, name) == 0) 466 if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
467 return mod; 467 return mod;
468 } 468 }
469 return NULL; 469 return NULL;
@@ -471,7 +471,7 @@ static struct module *find_module_all(const char *name,
471 471
472struct module *find_module(const char *name) 472struct module *find_module(const char *name)
473{ 473{
474 return find_module_all(name, false); 474 return find_module_all(name, strlen(name), false);
475} 475}
476EXPORT_SYMBOL_GPL(find_module); 476EXPORT_SYMBOL_GPL(find_module);
477 477
@@ -482,23 +482,28 @@ static inline void __percpu *mod_percpu(struct module *mod)
482 return mod->percpu; 482 return mod->percpu;
483} 483}
484 484
485static int percpu_modalloc(struct module *mod, 485static int percpu_modalloc(struct module *mod, struct load_info *info)
486 unsigned long size, unsigned long align)
487{ 486{
487 Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
488 unsigned long align = pcpusec->sh_addralign;
489
490 if (!pcpusec->sh_size)
491 return 0;
492
488 if (align > PAGE_SIZE) { 493 if (align > PAGE_SIZE) {
489 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 494 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
490 mod->name, align, PAGE_SIZE); 495 mod->name, align, PAGE_SIZE);
491 align = PAGE_SIZE; 496 align = PAGE_SIZE;
492 } 497 }
493 498
494 mod->percpu = __alloc_reserved_percpu(size, align); 499 mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
495 if (!mod->percpu) { 500 if (!mod->percpu) {
496 printk(KERN_WARNING 501 printk(KERN_WARNING
497 "%s: Could not allocate %lu bytes percpu data\n", 502 "%s: Could not allocate %lu bytes percpu data\n",
498 mod->name, size); 503 mod->name, (unsigned long)pcpusec->sh_size);
499 return -ENOMEM; 504 return -ENOMEM;
500 } 505 }
501 mod->percpu_size = size; 506 mod->percpu_size = pcpusec->sh_size;
502 return 0; 507 return 0;
503} 508}
504 509
@@ -563,10 +568,12 @@ static inline void __percpu *mod_percpu(struct module *mod)
563{ 568{
564 return NULL; 569 return NULL;
565} 570}
566static inline int percpu_modalloc(struct module *mod, 571static int percpu_modalloc(struct module *mod, struct load_info *info)
567 unsigned long size, unsigned long align)
568{ 572{
569 return -ENOMEM; 573 /* UP modules shouldn't have this section: ENOMEM isn't quite right */
574 if (info->sechdrs[info->index.pcpu].sh_size != 0)
575 return -ENOMEM;
576 return 0;
570} 577}
571static inline void percpu_modfree(struct module *mod) 578static inline void percpu_modfree(struct module *mod)
572{ 579{
@@ -2927,7 +2934,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2927{ 2934{
2928 /* Module within temporary copy. */ 2935 /* Module within temporary copy. */
2929 struct module *mod; 2936 struct module *mod;
2930 Elf_Shdr *pcpusec;
2931 int err; 2937 int err;
2932 2938
2933 mod = setup_load_info(info, flags); 2939 mod = setup_load_info(info, flags);
@@ -2942,17 +2948,10 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2942 err = module_frob_arch_sections(info->hdr, info->sechdrs, 2948 err = module_frob_arch_sections(info->hdr, info->sechdrs,
2943 info->secstrings, mod); 2949 info->secstrings, mod);
2944 if (err < 0) 2950 if (err < 0)
2945 goto out; 2951 return ERR_PTR(err);
2946 2952
2947 pcpusec = &info->sechdrs[info->index.pcpu]; 2953 /* We will do a special allocation for per-cpu sections later. */
2948 if (pcpusec->sh_size) { 2954 info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
2949 /* We have a special allocation for this section. */
2950 err = percpu_modalloc(mod,
2951 pcpusec->sh_size, pcpusec->sh_addralign);
2952 if (err)
2953 goto out;
2954 pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
2955 }
2956 2955
2957 /* Determine total sizes, and put offsets in sh_entsize. For now 2956 /* Determine total sizes, and put offsets in sh_entsize. For now
2958 this is done generically; there doesn't appear to be any 2957 this is done generically; there doesn't appear to be any
@@ -2963,17 +2962,12 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2963 /* Allocate and move to the final place */ 2962 /* Allocate and move to the final place */
2964 err = move_module(mod, info); 2963 err = move_module(mod, info);
2965 if (err) 2964 if (err)
2966 goto free_percpu; 2965 return ERR_PTR(err);
2967 2966
2968 /* Module has been copied to its final place now: return it. */ 2967 /* Module has been copied to its final place now: return it. */
2969 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2968 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2970 kmemleak_load_module(mod, info); 2969 kmemleak_load_module(mod, info);
2971 return mod; 2970 return mod;
2972
2973free_percpu:
2974 percpu_modfree(mod);
2975out:
2976 return ERR_PTR(err);
2977} 2971}
2978 2972
2979/* mod is no longer valid after this! */ 2973/* mod is no longer valid after this! */
@@ -3014,7 +3008,7 @@ static bool finished_loading(const char *name)
3014 bool ret; 3008 bool ret;
3015 3009
3016 mutex_lock(&module_mutex); 3010 mutex_lock(&module_mutex);
3017 mod = find_module_all(name, true); 3011 mod = find_module_all(name, strlen(name), true);
3018 ret = !mod || mod->state == MODULE_STATE_LIVE 3012 ret = !mod || mod->state == MODULE_STATE_LIVE
3019 || mod->state == MODULE_STATE_GOING; 3013 || mod->state == MODULE_STATE_GOING;
3020 mutex_unlock(&module_mutex); 3014 mutex_unlock(&module_mutex);
@@ -3152,7 +3146,8 @@ static int add_unformed_module(struct module *mod)
3152 3146
3153again: 3147again:
3154 mutex_lock(&module_mutex); 3148 mutex_lock(&module_mutex);
3155 if ((old = find_module_all(mod->name, true)) != NULL) { 3149 old = find_module_all(mod->name, strlen(mod->name), true);
3150 if (old != NULL) {
3156 if (old->state == MODULE_STATE_COMING 3151 if (old->state == MODULE_STATE_COMING
3157 || old->state == MODULE_STATE_UNFORMED) { 3152 || old->state == MODULE_STATE_UNFORMED) {
3158 /* Wait in case it fails to load. */ 3153 /* Wait in case it fails to load. */
@@ -3198,6 +3193,17 @@ out:
3198 return err; 3193 return err;
3199} 3194}
3200 3195
3196static int unknown_module_param_cb(char *param, char *val, const char *modname)
3197{
3198 /* Check for magic 'dyndbg' arg */
3199 int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
3200 if (ret != 0) {
3201 printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n",
3202 modname, param);
3203 }
3204 return 0;
3205}
3206
3201/* Allocate and load the module: note that size of section 0 is always 3207/* Allocate and load the module: note that size of section 0 is always
3202 zero, and we rely on this for optional sections. */ 3208 zero, and we rely on this for optional sections. */
3203static int load_module(struct load_info *info, const char __user *uargs, 3209static int load_module(struct load_info *info, const char __user *uargs,
@@ -3237,6 +3243,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
3237 } 3243 }
3238#endif 3244#endif
3239 3245
3246 /* To avoid stressing percpu allocator, do this once we're unique. */
3247 err = percpu_modalloc(mod, info);
3248 if (err)
3249 goto unlink_mod;
3250
3240 /* Now module is in final location, initialize linked lists, etc. */ 3251 /* Now module is in final location, initialize linked lists, etc. */
3241 err = module_unload_init(mod); 3252 err = module_unload_init(mod);
3242 if (err) 3253 if (err)
@@ -3284,7 +3295,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
3284 3295
3285 /* Module is ready to execute: parsing args may do that. */ 3296 /* Module is ready to execute: parsing args may do that. */
3286 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 3297 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
3287 -32768, 32767, &ddebug_dyndbg_module_param_cb); 3298 -32768, 32767, unknown_module_param_cb);
3288 if (err < 0) 3299 if (err < 0)
3289 goto bug_cleanup; 3300 goto bug_cleanup;
3290 3301
@@ -3563,10 +3574,8 @@ unsigned long module_kallsyms_lookup_name(const char *name)
3563 /* Don't lock: we're in enough trouble already. */ 3574 /* Don't lock: we're in enough trouble already. */
3564 preempt_disable(); 3575 preempt_disable();
3565 if ((colon = strchr(name, ':')) != NULL) { 3576 if ((colon = strchr(name, ':')) != NULL) {
3566 *colon = '\0'; 3577 if ((mod = find_module_all(name, colon - name, false)) != NULL)
3567 if ((mod = find_module(name)) != NULL)
3568 ret = mod_find_symname(mod, colon+1); 3578 ret = mod_find_symname(mod, colon+1);
3569 *colon = ':';
3570 } else { 3579 } else {
3571 list_for_each_entry_rcu(mod, &modules, list) { 3580 list_for_each_entry_rcu(mod, &modules, list) {
3572 if (mod->state == MODULE_STATE_UNFORMED) 3581 if (mod->state == MODULE_STATE_UNFORMED)
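
The module.c changes above give find_module_all() an explicit length, which is what lets module_kallsyms_lookup_name() resolve a "module:symbol" string without temporarily writing a NUL into it (the old *colon = '\0' ... *colon = ':' dance, unsafe for concurrent readers of the string). A userspace sketch of the length-bounded lookup, with a plain array standing in for the module list and invented module names:

#include <stdio.h>
#include <string.h>

static const char *modules[] = { "ext4", "snd_hda_intel", "e1000e" };

static const char *find_module_all(const char *name, size_t len)
{
	for (size_t i = 0; i < sizeof(modules) / sizeof(modules[0]); i++)
		if (strlen(modules[i]) == len && !memcmp(modules[i], name, len))
			return modules[i];
	return NULL;
}

static void lookup(const char *qualified)
{
	const char *colon = strchr(qualified, ':');
	const char *mod = colon ? find_module_all(qualified, colon - qualified) : NULL;

	printf("%s -> module %s, symbol %s\n", qualified,
	       mod ? mod : "(none)", colon ? colon + 1 : qualified);
}

int main(void)
{
	lookup("ext4:ext4_mount");   /* found, and the input string is never modified */
	lookup("bogus:foo");         /* no such module */
	return 0;
}
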
diff --git a/kernel/panic.c b/kernel/panic.c
index 167ec097ce8b..801864600514 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -15,6 +15,7 @@
15#include <linux/notifier.h> 15#include <linux/notifier.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/random.h> 17#include <linux/random.h>
18#include <linux/ftrace.h>
18#include <linux/reboot.h> 19#include <linux/reboot.h>
19#include <linux/delay.h> 20#include <linux/delay.h>
20#include <linux/kexec.h> 21#include <linux/kexec.h>
@@ -399,8 +400,11 @@ struct slowpath_args {
399static void warn_slowpath_common(const char *file, int line, void *caller, 400static void warn_slowpath_common(const char *file, int line, void *caller,
400 unsigned taint, struct slowpath_args *args) 401 unsigned taint, struct slowpath_args *args)
401{ 402{
402 printk(KERN_WARNING "------------[ cut here ]------------\n"); 403 disable_trace_on_warning();
403 printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); 404
405 pr_warn("------------[ cut here ]------------\n");
406 pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n",
407 raw_smp_processor_id(), current->pid, file, line, caller);
404 408
405 if (args) 409 if (args)
406 vprintk(args->fmt, args->args); 410 vprintk(args->fmt, args->args);
diff --git a/kernel/params.c b/kernel/params.c
index 53b958fcd639..440e65d1a544 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -787,7 +787,7 @@ static void __init kernel_add_sysfs_param(const char *name,
787} 787}
788 788
789/* 789/*
790 * param_sysfs_builtin - add contents in /sys/parameters for built-in modules 790 * param_sysfs_builtin - add sysfs parameters for built-in modules
791 * 791 *
792 * Add module_parameters to sysfs for "modules" built into the kernel. 792 * Add module_parameters to sysfs for "modules" built into the kernel.
793 * 793 *
diff --git a/kernel/pid.c b/kernel/pid.c
index 0db3e791a06d..66505c1dfc51 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -75,6 +75,7 @@ struct pid_namespace init_pid_ns = {
75 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } 75 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
76 }, 76 },
77 .last_pid = 0, 77 .last_pid = 0,
78 .nr_hashed = PIDNS_HASH_ADDING,
78 .level = 0, 79 .level = 0,
79 .child_reaper = &init_task, 80 .child_reaper = &init_task,
80 .user_ns = &init_user_ns, 81 .user_ns = &init_user_ns,
@@ -373,14 +374,10 @@ EXPORT_SYMBOL_GPL(find_vpid);
373/* 374/*
374 * attach_pid() must be called with the tasklist_lock write-held. 375 * attach_pid() must be called with the tasklist_lock write-held.
375 */ 376 */
376void attach_pid(struct task_struct *task, enum pid_type type, 377void attach_pid(struct task_struct *task, enum pid_type type)
377 struct pid *pid)
378{ 378{
379 struct pid_link *link; 379 struct pid_link *link = &task->pids[type];
380 380 hlist_add_head_rcu(&link->node, &link->pid->tasks[type]);
381 link = &task->pids[type];
382 link->pid = pid;
383 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
384} 381}
385 382
386static void __change_pid(struct task_struct *task, enum pid_type type, 383static void __change_pid(struct task_struct *task, enum pid_type type,
@@ -412,7 +409,7 @@ void change_pid(struct task_struct *task, enum pid_type type,
412 struct pid *pid) 409 struct pid *pid)
413{ 410{
414 __change_pid(task, type, pid); 411 __change_pid(task, type, pid);
415 attach_pid(task, type, pid); 412 attach_pid(task, type);
416} 413}
417 414
418/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ 415/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
@@ -594,7 +591,6 @@ void __init pidmap_init(void)
594 /* Reserve PID 0. We never call free_pidmap(0) */ 591 /* Reserve PID 0. We never call free_pidmap(0) */
595 set_bit(0, init_pid_ns.pidmap[0].page); 592 set_bit(0, init_pid_ns.pidmap[0].page);
596 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 593 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
597 init_pid_ns.nr_hashed = PIDNS_HASH_ADDING;
598 594
599 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 595 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
600 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 596 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 42670e9b44e0..c7f31aa272f7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock)
51 return error; 51 return error;
52} 52}
53 53
54static inline union cpu_time_count 54static inline unsigned long long
55timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) 55timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
56{ 56{
57 union cpu_time_count ret; 57 unsigned long long ret;
58 ret.sched = 0; /* high half always zero when .cpu used */ 58
59 ret = 0; /* high half always zero when .cpu used */
59 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 60 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
60 ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; 61 ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
61 } else { 62 } else {
62 ret.cpu = timespec_to_cputime(tp); 63 ret = cputime_to_expires(timespec_to_cputime(tp));
63 } 64 }
64 return ret; 65 return ret;
65} 66}
66 67
67static void sample_to_timespec(const clockid_t which_clock, 68static void sample_to_timespec(const clockid_t which_clock,
68 union cpu_time_count cpu, 69 unsigned long long expires,
69 struct timespec *tp) 70 struct timespec *tp)
70{ 71{
71 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) 72 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
72 *tp = ns_to_timespec(cpu.sched); 73 *tp = ns_to_timespec(expires);
73 else 74 else
74 cputime_to_timespec(cpu.cpu, tp); 75 cputime_to_timespec((__force cputime_t)expires, tp);
75}
76
77static inline int cpu_time_before(const clockid_t which_clock,
78 union cpu_time_count now,
79 union cpu_time_count then)
80{
81 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
82 return now.sched < then.sched;
83 } else {
84 return now.cpu < then.cpu;
85 }
86}
87static inline void cpu_time_add(const clockid_t which_clock,
88 union cpu_time_count *acc,
89 union cpu_time_count val)
90{
91 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
92 acc->sched += val.sched;
93 } else {
94 acc->cpu += val.cpu;
95 }
96}
97static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
98 union cpu_time_count a,
99 union cpu_time_count b)
100{
101 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
102 a.sched -= b.sched;
103 } else {
104 a.cpu -= b.cpu;
105 }
106 return a;
107} 76}
108 77
109/* 78/*
@@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
111 * given the current clock sample. 80 * given the current clock sample.
112 */ 81 */
113static void bump_cpu_timer(struct k_itimer *timer, 82static void bump_cpu_timer(struct k_itimer *timer,
114 union cpu_time_count now) 83 unsigned long long now)
115{ 84{
116 int i; 85 int i;
86 unsigned long long delta, incr;
117 87
118 if (timer->it.cpu.incr.sched == 0) 88 if (timer->it.cpu.incr == 0)
119 return; 89 return;
120 90
121 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 91 if (now < timer->it.cpu.expires)
122 unsigned long long delta, incr; 92 return;
123 93
124 if (now.sched < timer->it.cpu.expires.sched) 94 incr = timer->it.cpu.incr;
125 return; 95 delta = now + incr - timer->it.cpu.expires;
126 incr = timer->it.cpu.incr.sched;
127 delta = now.sched + incr - timer->it.cpu.expires.sched;
128 /* Don't use (incr*2 < delta), incr*2 might overflow. */
129 for (i = 0; incr < delta - incr; i++)
130 incr = incr << 1;
131 for (; i >= 0; incr >>= 1, i--) {
132 if (delta < incr)
133 continue;
134 timer->it.cpu.expires.sched += incr;
135 timer->it_overrun += 1 << i;
136 delta -= incr;
137 }
138 } else {
139 cputime_t delta, incr;
140 96
141 if (now.cpu < timer->it.cpu.expires.cpu) 97 /* Don't use (incr*2 < delta), incr*2 might overflow. */
142 return; 98 for (i = 0; incr < delta - incr; i++)
143 incr = timer->it.cpu.incr.cpu; 99 incr = incr << 1;
144 delta = now.cpu + incr - timer->it.cpu.expires.cpu; 100
145 /* Don't use (incr*2 < delta), incr*2 might overflow. */ 101 for (; i >= 0; incr >>= 1, i--) {
146 for (i = 0; incr < delta - incr; i++) 102 if (delta < incr)
147 incr += incr; 103 continue;
148 for (; i >= 0; incr = incr >> 1, i--) { 104
149 if (delta < incr) 105 timer->it.cpu.expires += incr;
150 continue; 106 timer->it_overrun += 1 << i;
151 timer->it.cpu.expires.cpu += incr; 107 delta -= incr;
152 timer->it_overrun += 1 << i;
153 delta -= incr;
154 }
155 } 108 }
156} 109}
157 110
@@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime)
170 return 0; 123 return 0;
171} 124}
172 125
173static inline cputime_t prof_ticks(struct task_struct *p) 126static inline unsigned long long prof_ticks(struct task_struct *p)
174{ 127{
175 cputime_t utime, stime; 128 cputime_t utime, stime;
176 129
177 task_cputime(p, &utime, &stime); 130 task_cputime(p, &utime, &stime);
178 131
179 return utime + stime; 132 return cputime_to_expires(utime + stime);
180} 133}
181static inline cputime_t virt_ticks(struct task_struct *p) 134static inline unsigned long long virt_ticks(struct task_struct *p)
182{ 135{
183 cputime_t utime; 136 cputime_t utime;
184 137
185 task_cputime(p, &utime, NULL); 138 task_cputime(p, &utime, NULL);
186 139
187 return utime; 140 return cputime_to_expires(utime);
188} 141}
189 142
190static int 143static int
@@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
225 * Sample a per-thread clock for the given task. 178 * Sample a per-thread clock for the given task.
226 */ 179 */
227static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, 180static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
228 union cpu_time_count *cpu) 181 unsigned long long *sample)
229{ 182{
230 switch (CPUCLOCK_WHICH(which_clock)) { 183 switch (CPUCLOCK_WHICH(which_clock)) {
231 default: 184 default:
232 return -EINVAL; 185 return -EINVAL;
233 case CPUCLOCK_PROF: 186 case CPUCLOCK_PROF:
234 cpu->cpu = prof_ticks(p); 187 *sample = prof_ticks(p);
235 break; 188 break;
236 case CPUCLOCK_VIRT: 189 case CPUCLOCK_VIRT:
237 cpu->cpu = virt_ticks(p); 190 *sample = virt_ticks(p);
238 break; 191 break;
239 case CPUCLOCK_SCHED: 192 case CPUCLOCK_SCHED:
240 cpu->sched = task_sched_runtime(p); 193 *sample = task_sched_runtime(p);
241 break; 194 break;
242 } 195 }
243 return 0; 196 return 0;
@@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
284 */ 237 */
285static int cpu_clock_sample_group(const clockid_t which_clock, 238static int cpu_clock_sample_group(const clockid_t which_clock,
286 struct task_struct *p, 239 struct task_struct *p,
287 union cpu_time_count *cpu) 240 unsigned long long *sample)
288{ 241{
289 struct task_cputime cputime; 242 struct task_cputime cputime;
290 243
@@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
293 return -EINVAL; 246 return -EINVAL;
294 case CPUCLOCK_PROF: 247 case CPUCLOCK_PROF:
295 thread_group_cputime(p, &cputime); 248 thread_group_cputime(p, &cputime);
296 cpu->cpu = cputime.utime + cputime.stime; 249 *sample = cputime_to_expires(cputime.utime + cputime.stime);
297 break; 250 break;
298 case CPUCLOCK_VIRT: 251 case CPUCLOCK_VIRT:
299 thread_group_cputime(p, &cputime); 252 thread_group_cputime(p, &cputime);
300 cpu->cpu = cputime.utime; 253 *sample = cputime_to_expires(cputime.utime);
301 break; 254 break;
302 case CPUCLOCK_SCHED: 255 case CPUCLOCK_SCHED:
303 thread_group_cputime(p, &cputime); 256 thread_group_cputime(p, &cputime);
304 cpu->sched = cputime.sum_exec_runtime; 257 *sample = cputime.sum_exec_runtime;
305 break; 258 break;
306 } 259 }
307 return 0; 260 return 0;
@@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
312{ 265{
313 const pid_t pid = CPUCLOCK_PID(which_clock); 266 const pid_t pid = CPUCLOCK_PID(which_clock);
314 int error = -EINVAL; 267 int error = -EINVAL;
315 union cpu_time_count rtn; 268 unsigned long long rtn;
316 269
317 if (pid == 0) { 270 if (pid == 0) {
318 /* 271 /*
@@ -446,6 +399,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
446 return ret; 399 return ret;
447} 400}
448 401
402static void cleanup_timers_list(struct list_head *head,
403 unsigned long long curr)
404{
405 struct cpu_timer_list *timer, *next;
406
407 list_for_each_entry_safe(timer, next, head, entry)
408 list_del_init(&timer->entry);
409}
410
449/* 411/*
450 * Clean out CPU timers still ticking when a thread exited. The task 412 * Clean out CPU timers still ticking when a thread exited. The task
451 * pointer is cleared, and the expiry time is replaced with the residual 413 * pointer is cleared, and the expiry time is replaced with the residual
@@ -456,37 +418,12 @@ static void cleanup_timers(struct list_head *head,
456 cputime_t utime, cputime_t stime, 418 cputime_t utime, cputime_t stime,
457 unsigned long long sum_exec_runtime) 419 unsigned long long sum_exec_runtime)
458{ 420{
459 struct cpu_timer_list *timer, *next;
460 cputime_t ptime = utime + stime;
461
462 list_for_each_entry_safe(timer, next, head, entry) {
463 list_del_init(&timer->entry);
464 if (timer->expires.cpu < ptime) {
465 timer->expires.cpu = 0;
466 } else {
467 timer->expires.cpu -= ptime;
468 }
469 }
470 421
471 ++head; 422 cputime_t ptime = utime + stime;
472 list_for_each_entry_safe(timer, next, head, entry) {
473 list_del_init(&timer->entry);
474 if (timer->expires.cpu < utime) {
475 timer->expires.cpu = 0;
476 } else {
477 timer->expires.cpu -= utime;
478 }
479 }
480 423
481 ++head; 424 cleanup_timers_list(head, cputime_to_expires(ptime));
482 list_for_each_entry_safe(timer, next, head, entry) { 425 cleanup_timers_list(++head, cputime_to_expires(utime));
483 list_del_init(&timer->entry); 426 cleanup_timers_list(++head, sum_exec_runtime);
484 if (timer->expires.sched < sum_exec_runtime) {
485 timer->expires.sched = 0;
486 } else {
487 timer->expires.sched -= sum_exec_runtime;
488 }
489 }
490} 427}
491 428
492/* 429/*
@@ -516,17 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
516 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 453 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
517} 454}
518 455
519static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 456static void clear_dead_task(struct k_itimer *itimer, unsigned long long now)
520{ 457{
458 struct cpu_timer_list *timer = &itimer->it.cpu;
459
521 /* 460 /*
522 * That's all for this thread or process. 461 * That's all for this thread or process.
523 * We leave our residual in expires to be reported. 462 * We leave our residual in expires to be reported.
524 */ 463 */
525 put_task_struct(timer->it.cpu.task); 464 put_task_struct(timer->task);
526 timer->it.cpu.task = NULL; 465 timer->task = NULL;
527 timer->it.cpu.expires = cpu_time_sub(timer->it_clock, 466 if (timer->expires < now) {
528 timer->it.cpu.expires, 467 timer->expires = 0;
529 now); 468 } else {
469 timer->expires -= now;
470 }
530} 471}
531 472
532static inline int expires_gt(cputime_t expires, cputime_t new_exp) 473static inline int expires_gt(cputime_t expires, cputime_t new_exp)
@@ -558,14 +499,14 @@ static void arm_timer(struct k_itimer *timer)
558 499
559 listpos = head; 500 listpos = head;
560 list_for_each_entry(next, head, entry) { 501 list_for_each_entry(next, head, entry) {
561 if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) 502 if (nt->expires < next->expires)
562 break; 503 break;
563 listpos = &next->entry; 504 listpos = &next->entry;
564 } 505 }
565 list_add(&nt->entry, listpos); 506 list_add(&nt->entry, listpos);
566 507
567 if (listpos == head) { 508 if (listpos == head) {
568 union cpu_time_count *exp = &nt->expires; 509 unsigned long long exp = nt->expires;
569 510
570 /* 511 /*
571 * We are the new earliest-expiring POSIX 1.b timer, hence 512 * We are the new earliest-expiring POSIX 1.b timer, hence
@@ -576,17 +517,17 @@ static void arm_timer(struct k_itimer *timer)
576 517
577 switch (CPUCLOCK_WHICH(timer->it_clock)) { 518 switch (CPUCLOCK_WHICH(timer->it_clock)) {
578 case CPUCLOCK_PROF: 519 case CPUCLOCK_PROF:
579 if (expires_gt(cputime_expires->prof_exp, exp->cpu)) 520 if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp)))
580 cputime_expires->prof_exp = exp->cpu; 521 cputime_expires->prof_exp = expires_to_cputime(exp);
581 break; 522 break;
582 case CPUCLOCK_VIRT: 523 case CPUCLOCK_VIRT:
583 if (expires_gt(cputime_expires->virt_exp, exp->cpu)) 524 if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp)))
584 cputime_expires->virt_exp = exp->cpu; 525 cputime_expires->virt_exp = expires_to_cputime(exp);
585 break; 526 break;
586 case CPUCLOCK_SCHED: 527 case CPUCLOCK_SCHED:
587 if (cputime_expires->sched_exp == 0 || 528 if (cputime_expires->sched_exp == 0 ||
588 cputime_expires->sched_exp > exp->sched) 529 cputime_expires->sched_exp > exp)
589 cputime_expires->sched_exp = exp->sched; 530 cputime_expires->sched_exp = exp;
590 break; 531 break;
591 } 532 }
592 } 533 }
@@ -601,20 +542,20 @@ static void cpu_timer_fire(struct k_itimer *timer)
601 /* 542 /*
602 * User don't want any signal. 543 * User don't want any signal.
603 */ 544 */
604 timer->it.cpu.expires.sched = 0; 545 timer->it.cpu.expires = 0;
605 } else if (unlikely(timer->sigq == NULL)) { 546 } else if (unlikely(timer->sigq == NULL)) {
606 /* 547 /*
607 * This a special case for clock_nanosleep, 548 * This a special case for clock_nanosleep,
608 * not a normal timer from sys_timer_create. 549 * not a normal timer from sys_timer_create.
609 */ 550 */
610 wake_up_process(timer->it_process); 551 wake_up_process(timer->it_process);
611 timer->it.cpu.expires.sched = 0; 552 timer->it.cpu.expires = 0;
612 } else if (timer->it.cpu.incr.sched == 0) { 553 } else if (timer->it.cpu.incr == 0) {
613 /* 554 /*
614 * One-shot timer. Clear it as soon as it's fired. 555 * One-shot timer. Clear it as soon as it's fired.
615 */ 556 */
616 posix_timer_event(timer, 0); 557 posix_timer_event(timer, 0);
617 timer->it.cpu.expires.sched = 0; 558 timer->it.cpu.expires = 0;
618 } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { 559 } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
619 /* 560 /*
620 * The signal did not get queued because the signal 561 * The signal did not get queued because the signal
@@ -632,7 +573,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
632 */ 573 */
633static int cpu_timer_sample_group(const clockid_t which_clock, 574static int cpu_timer_sample_group(const clockid_t which_clock,
634 struct task_struct *p, 575 struct task_struct *p,
635 union cpu_time_count *cpu) 576 unsigned long long *sample)
636{ 577{
637 struct task_cputime cputime; 578 struct task_cputime cputime;
638 579
@@ -641,13 +582,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
641 default: 582 default:
642 return -EINVAL; 583 return -EINVAL;
643 case CPUCLOCK_PROF: 584 case CPUCLOCK_PROF:
644 cpu->cpu = cputime.utime + cputime.stime; 585 *sample = cputime_to_expires(cputime.utime + cputime.stime);
645 break; 586 break;
646 case CPUCLOCK_VIRT: 587 case CPUCLOCK_VIRT:
647 cpu->cpu = cputime.utime; 588 *sample = cputime_to_expires(cputime.utime);
648 break; 589 break;
649 case CPUCLOCK_SCHED: 590 case CPUCLOCK_SCHED:
650 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); 591 *sample = cputime.sum_exec_runtime + task_delta_exec(p);
651 break; 592 break;
652 } 593 }
653 return 0; 594 return 0;
@@ -694,7 +635,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
694 struct itimerspec *new, struct itimerspec *old) 635 struct itimerspec *new, struct itimerspec *old)
695{ 636{
696 struct task_struct *p = timer->it.cpu.task; 637 struct task_struct *p = timer->it.cpu.task;
697 union cpu_time_count old_expires, new_expires, old_incr, val; 638 unsigned long long old_expires, new_expires, old_incr, val;
698 int ret; 639 int ret;
699 640
700 if (unlikely(p == NULL)) { 641 if (unlikely(p == NULL)) {
@@ -749,7 +690,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
749 } 690 }
750 691
751 if (old) { 692 if (old) {
752 if (old_expires.sched == 0) { 693 if (old_expires == 0) {
753 old->it_value.tv_sec = 0; 694 old->it_value.tv_sec = 0;
754 old->it_value.tv_nsec = 0; 695 old->it_value.tv_nsec = 0;
755 } else { 696 } else {
@@ -764,11 +705,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
764 * new setting. 705 * new setting.
765 */ 706 */
766 bump_cpu_timer(timer, val); 707 bump_cpu_timer(timer, val);
767 if (cpu_time_before(timer->it_clock, val, 708 if (val < timer->it.cpu.expires) {
768 timer->it.cpu.expires)) { 709 old_expires = timer->it.cpu.expires - val;
769 old_expires = cpu_time_sub(
770 timer->it_clock,
771 timer->it.cpu.expires, val);
772 sample_to_timespec(timer->it_clock, 710 sample_to_timespec(timer->it_clock,
773 old_expires, 711 old_expires,
774 &old->it_value); 712 &old->it_value);
@@ -791,8 +729,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
791 goto out; 729 goto out;
792 } 730 }
793 731
794 if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { 732 if (new_expires != 0 && !(flags & TIMER_ABSTIME)) {
795 cpu_time_add(timer->it_clock, &new_expires, val); 733 new_expires += val;
796 } 734 }
797 735
798 /* 736 /*
@@ -801,8 +739,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
801 * arm the timer (we'll just fake it for timer_gettime). 739 * arm the timer (we'll just fake it for timer_gettime).
802 */ 740 */
803 timer->it.cpu.expires = new_expires; 741 timer->it.cpu.expires = new_expires;
804 if (new_expires.sched != 0 && 742 if (new_expires != 0 && val < new_expires) {
805 cpu_time_before(timer->it_clock, val, new_expires)) {
806 arm_timer(timer); 743 arm_timer(timer);
807 } 744 }
808 745
@@ -826,8 +763,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
826 timer->it_overrun_last = 0; 763 timer->it_overrun_last = 0;
827 timer->it_overrun = -1; 764 timer->it_overrun = -1;
828 765
829 if (new_expires.sched != 0 && 766 if (new_expires != 0 && !(val < new_expires)) {
830 !cpu_time_before(timer->it_clock, val, new_expires)) {
831 /* 767 /*
832 * The designated time already passed, so we notify 768 * The designated time already passed, so we notify
833 * immediately, even if the thread never runs to 769 * immediately, even if the thread never runs to
@@ -849,7 +785,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
849 785
850static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 786static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
851{ 787{
852 union cpu_time_count now; 788 unsigned long long now;
853 struct task_struct *p = timer->it.cpu.task; 789 struct task_struct *p = timer->it.cpu.task;
854 int clear_dead; 790 int clear_dead;
855 791
@@ -859,7 +795,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
859 sample_to_timespec(timer->it_clock, 795 sample_to_timespec(timer->it_clock,
860 timer->it.cpu.incr, &itp->it_interval); 796 timer->it.cpu.incr, &itp->it_interval);
861 797
862 if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ 798 if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
863 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; 799 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
864 return; 800 return;
865 } 801 }
@@ -891,7 +827,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
891 */ 827 */
892 put_task_struct(p); 828 put_task_struct(p);
893 timer->it.cpu.task = NULL; 829 timer->it.cpu.task = NULL;
894 timer->it.cpu.expires.sched = 0; 830 timer->it.cpu.expires = 0;
895 read_unlock(&tasklist_lock); 831 read_unlock(&tasklist_lock);
896 goto dead; 832 goto dead;
897 } else { 833 } else {
@@ -912,10 +848,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
912 goto dead; 848 goto dead;
913 } 849 }
914 850
915 if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { 851 if (now < timer->it.cpu.expires) {
916 sample_to_timespec(timer->it_clock, 852 sample_to_timespec(timer->it_clock,
917 cpu_time_sub(timer->it_clock, 853 timer->it.cpu.expires - now,
918 timer->it.cpu.expires, now),
919 &itp->it_value); 854 &itp->it_value);
920 } else { 855 } else {
921 /* 856 /*
@@ -927,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
927 } 862 }
928} 863}
929 864
865static unsigned long long
866check_timers_list(struct list_head *timers,
867 struct list_head *firing,
868 unsigned long long curr)
869{
870 int maxfire = 20;
871
872 while (!list_empty(timers)) {
873 struct cpu_timer_list *t;
874
875 t = list_first_entry(timers, struct cpu_timer_list, entry);
876
877 if (!--maxfire || curr < t->expires)
878 return t->expires;
879
880 t->firing = 1;
881 list_move_tail(&t->entry, firing);
882 }
883
884 return 0;
885}
886
930/* 887/*
931 * Check for any per-thread CPU timers that have fired and move them off 888 * Check for any per-thread CPU timers that have fired and move them off
932 * the tsk->cpu_timers[N] list onto the firing list. Here we update the 889 * the tsk->cpu_timers[N] list onto the firing list. Here we update the
@@ -935,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
935static void check_thread_timers(struct task_struct *tsk, 892static void check_thread_timers(struct task_struct *tsk,
936 struct list_head *firing) 893 struct list_head *firing)
937{ 894{
938 int maxfire;
939 struct list_head *timers = tsk->cpu_timers; 895 struct list_head *timers = tsk->cpu_timers;
940 struct signal_struct *const sig = tsk->signal; 896 struct signal_struct *const sig = tsk->signal;
897 struct task_cputime *tsk_expires = &tsk->cputime_expires;
898 unsigned long long expires;
941 unsigned long soft; 899 unsigned long soft;
942 900
943 maxfire = 20; 901 expires = check_timers_list(timers, firing, prof_ticks(tsk));
944 tsk->cputime_expires.prof_exp = 0; 902 tsk_expires->prof_exp = expires_to_cputime(expires);
945 while (!list_empty(timers)) {
946 struct cpu_timer_list *t = list_first_entry(timers,
947 struct cpu_timer_list,
948 entry);
949 if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
950 tsk->cputime_expires.prof_exp = t->expires.cpu;
951 break;
952 }
953 t->firing = 1;
954 list_move_tail(&t->entry, firing);
955 }
956 903
957 ++timers; 904 expires = check_timers_list(++timers, firing, virt_ticks(tsk));
958 maxfire = 20; 905 tsk_expires->virt_exp = expires_to_cputime(expires);
959 tsk->cputime_expires.virt_exp = 0;
960 while (!list_empty(timers)) {
961 struct cpu_timer_list *t = list_first_entry(timers,
962 struct cpu_timer_list,
963 entry);
964 if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
965 tsk->cputime_expires.virt_exp = t->expires.cpu;
966 break;
967 }
968 t->firing = 1;
969 list_move_tail(&t->entry, firing);
970 }
971 906
972 ++timers; 907 tsk_expires->sched_exp = check_timers_list(++timers, firing,
973 maxfire = 20; 908 tsk->se.sum_exec_runtime);
974 tsk->cputime_expires.sched_exp = 0;
975 while (!list_empty(timers)) {
976 struct cpu_timer_list *t = list_first_entry(timers,
977 struct cpu_timer_list,
978 entry);
979 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
980 tsk->cputime_expires.sched_exp = t->expires.sched;
981 break;
982 }
983 t->firing = 1;
984 list_move_tail(&t->entry, firing);
985 }
986 909
987 /* 910 /*
988 * Check for the special case thread timers. 911 * Check for the special case thread timers.
@@ -1030,7 +953,8 @@ static void stop_process_timers(struct signal_struct *sig)
1030static u32 onecputick; 953static u32 onecputick;
1031 954
1032static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, 955static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1033 cputime_t *expires, cputime_t cur_time, int signo) 956 unsigned long long *expires,
957 unsigned long long cur_time, int signo)
1034{ 958{
1035 if (!it->expires) 959 if (!it->expires)
1036 return; 960 return;
@@ -1066,9 +990,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1066static void check_process_timers(struct task_struct *tsk, 990static void check_process_timers(struct task_struct *tsk,
1067 struct list_head *firing) 991 struct list_head *firing)
1068{ 992{
1069 int maxfire;
1070 struct signal_struct *const sig = tsk->signal; 993 struct signal_struct *const sig = tsk->signal;
1071 cputime_t utime, ptime, virt_expires, prof_expires; 994 unsigned long long utime, ptime, virt_expires, prof_expires;
1072 unsigned long long sum_sched_runtime, sched_expires; 995 unsigned long long sum_sched_runtime, sched_expires;
1073 struct list_head *timers = sig->cpu_timers; 996 struct list_head *timers = sig->cpu_timers;
1074 struct task_cputime cputime; 997 struct task_cputime cputime;
@@ -1078,52 +1001,13 @@ static void check_process_timers(struct task_struct *tsk,
1078 * Collect the current process totals. 1001 * Collect the current process totals.
1079 */ 1002 */
1080 thread_group_cputimer(tsk, &cputime); 1003 thread_group_cputimer(tsk, &cputime);
1081 utime = cputime.utime; 1004 utime = cputime_to_expires(cputime.utime);
1082 ptime = utime + cputime.stime; 1005 ptime = utime + cputime_to_expires(cputime.stime);
1083 sum_sched_runtime = cputime.sum_exec_runtime; 1006 sum_sched_runtime = cputime.sum_exec_runtime;
1084 maxfire = 20;
1085 prof_expires = 0;
1086 while (!list_empty(timers)) {
1087 struct cpu_timer_list *tl = list_first_entry(timers,
1088 struct cpu_timer_list,
1089 entry);
1090 if (!--maxfire || ptime < tl->expires.cpu) {
1091 prof_expires = tl->expires.cpu;
1092 break;
1093 }
1094 tl->firing = 1;
1095 list_move_tail(&tl->entry, firing);
1096 }
1097 1007
1098 ++timers; 1008 prof_expires = check_timers_list(timers, firing, ptime);
1099 maxfire = 20; 1009 virt_expires = check_timers_list(++timers, firing, utime);
1100 virt_expires = 0; 1010 sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);
1101 while (!list_empty(timers)) {
1102 struct cpu_timer_list *tl = list_first_entry(timers,
1103 struct cpu_timer_list,
1104 entry);
1105 if (!--maxfire || utime < tl->expires.cpu) {
1106 virt_expires = tl->expires.cpu;
1107 break;
1108 }
1109 tl->firing = 1;
1110 list_move_tail(&tl->entry, firing);
1111 }
1112
1113 ++timers;
1114 maxfire = 20;
1115 sched_expires = 0;
1116 while (!list_empty(timers)) {
1117 struct cpu_timer_list *tl = list_first_entry(timers,
1118 struct cpu_timer_list,
1119 entry);
1120 if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
1121 sched_expires = tl->expires.sched;
1122 break;
1123 }
1124 tl->firing = 1;
1125 list_move_tail(&tl->entry, firing);
1126 }
1127 1011
1128 /* 1012 /*
1129 * Check for the special case process timers. 1013 * Check for the special case process timers.
@@ -1162,8 +1046,8 @@ static void check_process_timers(struct task_struct *tsk,
1162 } 1046 }
1163 } 1047 }
1164 1048
1165 sig->cputime_expires.prof_exp = prof_expires; 1049 sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
1166 sig->cputime_expires.virt_exp = virt_expires; 1050 sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
1167 sig->cputime_expires.sched_exp = sched_expires; 1051 sig->cputime_expires.sched_exp = sched_expires;
1168 if (task_cputime_zero(&sig->cputime_expires)) 1052 if (task_cputime_zero(&sig->cputime_expires))
1169 stop_process_timers(sig); 1053 stop_process_timers(sig);
@@ -1176,7 +1060,7 @@ static void check_process_timers(struct task_struct *tsk,
1176void posix_cpu_timer_schedule(struct k_itimer *timer) 1060void posix_cpu_timer_schedule(struct k_itimer *timer)
1177{ 1061{
1178 struct task_struct *p = timer->it.cpu.task; 1062 struct task_struct *p = timer->it.cpu.task;
1179 union cpu_time_count now; 1063 unsigned long long now;
1180 1064
1181 if (unlikely(p == NULL)) 1065 if (unlikely(p == NULL))
1182 /* 1066 /*
@@ -1205,7 +1089,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1205 */ 1089 */
1206 put_task_struct(p); 1090 put_task_struct(p);
1207 timer->it.cpu.task = p = NULL; 1091 timer->it.cpu.task = p = NULL;
1208 timer->it.cpu.expires.sched = 0; 1092 timer->it.cpu.expires = 0;
1209 goto out_unlock; 1093 goto out_unlock;
1210 } else if (unlikely(p->exit_state) && thread_group_empty(p)) { 1094 } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1211 /* 1095 /*
@@ -1213,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1213 * not yet reaped. Take this opportunity to 1097 * not yet reaped. Take this opportunity to
1214 * drop our task ref. 1098 * drop our task ref.
1215 */ 1099 */
1100 cpu_timer_sample_group(timer->it_clock, p, &now);
1216 clear_dead_task(timer, now); 1101 clear_dead_task(timer, now);
1217 goto out_unlock; 1102 goto out_unlock;
1218 } 1103 }
@@ -1387,7 +1272,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1387void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1272void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1388 cputime_t *newval, cputime_t *oldval) 1273 cputime_t *newval, cputime_t *oldval)
1389{ 1274{
1390 union cpu_time_count now; 1275 unsigned long long now;
1391 1276
1392 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1277 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1393 cpu_timer_sample_group(clock_idx, tsk, &now); 1278 cpu_timer_sample_group(clock_idx, tsk, &now);
@@ -1399,17 +1284,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1399 * it to be absolute. 1284 * it to be absolute.
1400 */ 1285 */
1401 if (*oldval) { 1286 if (*oldval) {
1402 if (*oldval <= now.cpu) { 1287 if (*oldval <= now) {
1403 /* Just about to fire. */ 1288 /* Just about to fire. */
1404 *oldval = cputime_one_jiffy; 1289 *oldval = cputime_one_jiffy;
1405 } else { 1290 } else {
1406 *oldval -= now.cpu; 1291 *oldval -= now;
1407 } 1292 }
1408 } 1293 }
1409 1294
1410 if (!*newval) 1295 if (!*newval)
1411 goto out; 1296 goto out;
1412 *newval += now.cpu; 1297 *newval += now;
1413 } 1298 }
1414 1299
1415 /* 1300 /*
@@ -1459,7 +1344,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1459 } 1344 }
1460 1345
1461 while (!signal_pending(current)) { 1346 while (!signal_pending(current)) {
1462 if (timer.it.cpu.expires.sched == 0) { 1347 if (timer.it.cpu.expires == 0) {
1463 /* 1348 /*
1464 * Our timer fired and was reset, below 1349 * Our timer fired and was reset, below
1465 * deletion can not fail. 1350 * deletion can not fail.
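
The posix-cpu-timers conversion above retires union cpu_time_count in favour of a plain unsigned long long, so the SCHED and PROF/VIRT cases share one code path; bump_cpu_timer() in particular collapses into a single overflow-safe loop that doubles the increment until it covers more than half of the remaining delta, then walks it back down while accumulating power-of-two overrun counts. A standalone sketch of that loop:

#include <inttypes.h>
#include <stdio.h>

static void bump(uint64_t *expires, uint64_t incr, uint64_t now, unsigned long *overrun)
{
	uint64_t delta;
	int i;

	if (incr == 0 || now < *expires)
		return;

	delta = now + incr - *expires;

	/* Don't test (incr * 2 < delta): the multiplication might overflow. */
	for (i = 0; incr < delta - incr; i++)
		incr <<= 1;

	for (; i >= 0; incr >>= 1, i--) {
		if (delta < incr)
			continue;
		*expires += incr;
		*overrun += 1UL << i;
		delta -= incr;
	}
}

int main(void)
{
	uint64_t expires = 1000, incr = 300, now = 2500;
	unsigned long overrun = 0;

	bump(&expires, incr, now, &overrun);
	/* Six increments of 300 move the expiry from 1000 to 2800, the first
	 * value past now = 2500; the overrun counter ends up at 6 (4 + 2). */
	printf("expires=%" PRIu64 " overrun=%lu\n", expires, overrun);
	return 0;
}
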
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9c39de095ba9..d444c4e834f4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -262,6 +262,26 @@ config PM_GENERIC_DOMAINS
262 bool 262 bool
263 depends on PM 263 depends on PM
264 264
265config WQ_POWER_EFFICIENT_DEFAULT
266 bool "Enable workqueue power-efficient mode by default"
267 depends on PM
268 default n
269 help
270 Per-cpu workqueues are generally preferred because they show
271 better performance thanks to cache locality; unfortunately,
272 per-cpu workqueues tend to be more power hungry than unbound
273 workqueues.
274
275 Enabling workqueue.power_efficient kernel parameter makes the
276 per-cpu workqueues which were observed to contribute
277 significantly to power consumption unbound, leading to measurably
278 lower power usage at the cost of small performance overhead.
279
280 This config option determines whether workqueue.power_efficient
281 is enabled by default.
282
283 If in doubt, say N.
284
265config PM_GENERIC_DOMAINS_SLEEP 285config PM_GENERIC_DOMAINS_SLEEP
266 def_bool y 286 def_bool y
267 depends on PM_SLEEP && PM_GENERIC_DOMAINS 287 depends on PM_SLEEP && PM_GENERIC_DOMAINS
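
The new WQ_POWER_EFFICIENT_DEFAULT option above only picks a default; the runtime switch is still the workqueue.power_efficient parameter that the help text names. A Kconfig default usually reaches the code as a compile-time macro feeding an overridable variable; the userspace analogue below illustrates that pattern (the actual wiring inside kernel/workqueue.c is not shown in this hunk, so the names here are assumptions):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#ifndef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
#define CONFIG_WQ_POWER_EFFICIENT_DEFAULT 0   /* "default n" in the Kconfig text */
#endif

static bool wq_power_efficient = CONFIG_WQ_POWER_EFFICIENT_DEFAULT;

int main(int argc, char **argv)
{
	/* stands in for workqueue.power_efficient=1 on the kernel command line */
	if (argc > 1 && !strcmp(argv[1], "power_efficient=1"))
		wq_power_efficient = true;

	printf("power-efficient workqueues: %s\n", wq_power_efficient ? "on" : "off");
	return 0;
}
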
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d77663bfedeb..1d1bf630e6e9 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -424,6 +424,8 @@ static ssize_t wakeup_count_store(struct kobject *kobj,
424 if (sscanf(buf, "%u", &val) == 1) { 424 if (sscanf(buf, "%u", &val) == 1) {
425 if (pm_save_wakeup_count(val)) 425 if (pm_save_wakeup_count(val))
426 error = n; 426 error = n;
427 else
428 pm_print_active_wakeup_sources();
427 } 429 }
428 430
429 out: 431 out:
@@ -528,6 +530,10 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
528 530
529 if (sscanf(buf, "%d", &val) == 1) { 531 if (sscanf(buf, "%d", &val) == 1) {
530 pm_trace_enabled = !!val; 532 pm_trace_enabled = !!val;
533 if (pm_trace_enabled) {
534 pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n"
535 "PM: Correct system time has to be restored manually after resume.\n");
536 }
531 return n; 537 return n;
532 } 538 }
533 return -EINVAL; 539 return -EINVAL;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 98088e0e71e8..fc0df8486449 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -30,9 +30,10 @@ static int try_to_freeze_tasks(bool user_only)
30 unsigned int todo; 30 unsigned int todo;
31 bool wq_busy = false; 31 bool wq_busy = false;
32 struct timeval start, end; 32 struct timeval start, end;
33 u64 elapsed_csecs64; 33 u64 elapsed_msecs64;
34 unsigned int elapsed_csecs; 34 unsigned int elapsed_msecs;
35 bool wakeup = false; 35 bool wakeup = false;
36 int sleep_usecs = USEC_PER_MSEC;
36 37
37 do_gettimeofday(&start); 38 do_gettimeofday(&start);
38 39
@@ -68,22 +69,25 @@ static int try_to_freeze_tasks(bool user_only)
68 69
69 /* 70 /*
70 * We need to retry, but first give the freezing tasks some 71 * We need to retry, but first give the freezing tasks some
71 * time to enter the refrigerator. 72 * time to enter the refrigerator. Start with an initial
73 * 1 ms sleep followed by exponential backoff until 8 ms.
72 */ 74 */
73 msleep(10); 75 usleep_range(sleep_usecs / 2, sleep_usecs);
76 if (sleep_usecs < 8 * USEC_PER_MSEC)
77 sleep_usecs *= 2;
74 } 78 }
75 79
76 do_gettimeofday(&end); 80 do_gettimeofday(&end);
77 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); 81 elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
78 do_div(elapsed_csecs64, NSEC_PER_SEC / 100); 82 do_div(elapsed_msecs64, NSEC_PER_MSEC);
79 elapsed_csecs = elapsed_csecs64; 83 elapsed_msecs = elapsed_msecs64;
80 84
81 if (todo) { 85 if (todo) {
82 printk("\n"); 86 printk("\n");
83 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " 87 printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds "
84 "(%d tasks refusing to freeze, wq_busy=%d):\n", 88 "(%d tasks refusing to freeze, wq_busy=%d):\n",
85 wakeup ? "aborted" : "failed", 89 wakeup ? "aborted" : "failed",
86 elapsed_csecs / 100, elapsed_csecs % 100, 90 elapsed_msecs / 1000, elapsed_msecs % 1000,
87 todo - wq_busy, wq_busy); 91 todo - wq_busy, wq_busy);
88 92
89 if (!wakeup) { 93 if (!wakeup) {
@@ -96,8 +100,8 @@ static int try_to_freeze_tasks(bool user_only)
96 read_unlock(&tasklist_lock); 100 read_unlock(&tasklist_lock);
97 } 101 }
98 } else { 102 } else {
99 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, 103 printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
100 elapsed_csecs % 100); 104 elapsed_msecs % 1000);
101 } 105 }
102 106
103 return todo ? -EBUSY : 0; 107 return todo ? -EBUSY : 0;
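
The process.c hunks above replace the fixed msleep(10) between freezer retries with a usleep_range() backoff that starts near 1 ms and doubles up to an 8 ms cap, and they switch the elapsed-time reporting from centiseconds to milliseconds. A userspace sketch of the backoff loop (usleep() stands in for usleep_range(), and the freeze check is a dummy):

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define USEC_PER_MSEC 1000

static bool everything_frozen(int attempt)
{
	return attempt >= 5;            /* stand-in for the real "todo == 0" check */
}

int main(void)
{
	int sleep_usecs = USEC_PER_MSEC;
	int attempt = 0;

	while (!everything_frozen(attempt)) {
		attempt++;
		printf("retry %d: sleeping up to %d us\n", attempt, sleep_usecs);
		usleep(sleep_usecs);    /* usleep_range(sleep_usecs / 2, sleep_usecs) in the kernel */
		if (sleep_usecs < 8 * USEC_PER_MSEC)
			sleep_usecs *= 2;
	}
	printf("everything frozen after %d retries\n", attempt);
	return 0;
}
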
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 587dddeebf15..06fe28589e9c 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -44,6 +44,7 @@
44 44
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/export.h> 46#include <linux/export.h>
47#include <trace/events/power.h>
47 48
48/* 49/*
49 * locking rule: all changes to constraints or notifiers lists 50 * locking rule: all changes to constraints or notifiers lists
@@ -202,6 +203,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
202 203
203 spin_unlock_irqrestore(&pm_qos_lock, flags); 204 spin_unlock_irqrestore(&pm_qos_lock, flags);
204 205
206 trace_pm_qos_update_target(action, prev_value, curr_value);
205 if (prev_value != curr_value) { 207 if (prev_value != curr_value) {
206 blocking_notifier_call_chain(c->notifiers, 208 blocking_notifier_call_chain(c->notifiers,
207 (unsigned long)curr_value, 209 (unsigned long)curr_value,
@@ -272,6 +274,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,
272 274
273 spin_unlock_irqrestore(&pm_qos_lock, irqflags); 275 spin_unlock_irqrestore(&pm_qos_lock, irqflags);
274 276
277 trace_pm_qos_update_flags(action, prev_value, curr_value);
275 return prev_value != curr_value; 278 return prev_value != curr_value;
276} 279}
277 280
@@ -333,6 +336,7 @@ void pm_qos_add_request(struct pm_qos_request *req,
333 } 336 }
334 req->pm_qos_class = pm_qos_class; 337 req->pm_qos_class = pm_qos_class;
335 INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); 338 INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
339 trace_pm_qos_add_request(pm_qos_class, value);
336 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, 340 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
337 &req->node, PM_QOS_ADD_REQ, value); 341 &req->node, PM_QOS_ADD_REQ, value);
338} 342}
@@ -361,6 +365,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
361 365
362 cancel_delayed_work_sync(&req->work); 366 cancel_delayed_work_sync(&req->work);
363 367
368 trace_pm_qos_update_request(req->pm_qos_class, new_value);
364 if (new_value != req->node.prio) 369 if (new_value != req->node.prio)
365 pm_qos_update_target( 370 pm_qos_update_target(
366 pm_qos_array[req->pm_qos_class]->constraints, 371 pm_qos_array[req->pm_qos_class]->constraints,
@@ -387,6 +392,8 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
387 392
388 cancel_delayed_work_sync(&req->work); 393 cancel_delayed_work_sync(&req->work);
389 394
395 trace_pm_qos_update_request_timeout(req->pm_qos_class,
396 new_value, timeout_us);
390 if (new_value != req->node.prio) 397 if (new_value != req->node.prio)
391 pm_qos_update_target( 398 pm_qos_update_target(
392 pm_qos_array[req->pm_qos_class]->constraints, 399 pm_qos_array[req->pm_qos_class]->constraints,
@@ -416,6 +423,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)
416 423
417 cancel_delayed_work_sync(&req->work); 424 cancel_delayed_work_sync(&req->work);
418 425
426 trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE);
419 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, 427 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
420 &req->node, PM_QOS_REMOVE_REQ, 428 &req->node, PM_QOS_REMOVE_REQ,
421 PM_QOS_DEFAULT_VALUE); 429 PM_QOS_DEFAULT_VALUE);
@@ -477,7 +485,7 @@ static int find_pm_qos_object_by_minor(int minor)
477{ 485{
478 int pm_qos_class; 486 int pm_qos_class;
479 487
480 for (pm_qos_class = 0; 488 for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY;
481 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { 489 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
482 if (minor == 490 if (minor ==
483 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) 491 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
@@ -491,7 +499,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
491 long pm_qos_class; 499 long pm_qos_class;
492 500
493 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 501 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
494 if (pm_qos_class >= 0) { 502 if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) {
495 struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); 503 struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
496 if (!req) 504 if (!req)
497 return -ENOMEM; 505 return -ENOMEM;
@@ -584,7 +592,7 @@ static int __init pm_qos_power_init(void)
584 592
585 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); 593 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
586 594
587 for (i = 1; i < PM_QOS_NUM_CLASSES; i++) { 595 for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
588 ret = register_pm_qos_misc(pm_qos_array[i]); 596 ret = register_pm_qos_misc(pm_qos_array[i]);
589 if (ret < 0) { 597 if (ret < 0) {
590 printk(KERN_ERR "pm_qos_param: %s setup failed\n", 598 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
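
The tracepoints added above hook the existing PM QoS request API rather than changing it. For context, here is a minimal sketch of how a driver exercises that API; the request object name and the latency values are illustrative, not taken from this patch.

#include <linux/module.h>
#include <linux/pm_qos.h>

static struct pm_qos_request demo_req;	/* hypothetical request owned by a driver */

static int __init demo_qos_init(void)
{
	/* Ask for at most 20 usec of CPU/DMA latency; hits trace_pm_qos_add_request(). */
	pm_qos_add_request(&demo_req, PM_QOS_CPU_DMA_LATENCY, 20);

	/* Relax the constraint later; hits trace_pm_qos_update_request(). */
	pm_qos_update_request(&demo_req, 100);
	return 0;
}

static void __exit demo_qos_exit(void)
{
	/* Drop the constraint; hits trace_pm_qos_remove_request(). */
	pm_qos_remove_request(&demo_req);
}

module_init(demo_qos_init);
module_exit(demo_qos_exit);
MODULE_LICENSE("GPL");
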
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0de28576807d..349587bb03e1 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -642,8 +642,9 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
642 region->end_pfn = end_pfn; 642 region->end_pfn = end_pfn;
643 list_add_tail(&region->list, &nosave_regions); 643 list_add_tail(&region->list, &nosave_regions);
644 Report: 644 Report:
645 printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n", 645 printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n",
646 start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); 646 (unsigned long long) start_pfn << PAGE_SHIFT,
647 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
647} 648}
648 649
649/* 650/*
@@ -1651,7 +1652,7 @@ unsigned long snapshot_get_image_size(void)
1651static int init_header(struct swsusp_info *info) 1652static int init_header(struct swsusp_info *info)
1652{ 1653{
1653 memset(info, 0, sizeof(struct swsusp_info)); 1654 memset(info, 0, sizeof(struct swsusp_info));
1654 info->num_physpages = num_physpages; 1655 info->num_physpages = get_num_physpages();
1655 info->image_pages = nr_copy_pages; 1656 info->image_pages = nr_copy_pages;
1656 info->pages = snapshot_get_image_size(); 1657 info->pages = snapshot_get_image_size();
1657 info->size = info->pages; 1658 info->size = info->pages;
@@ -1795,7 +1796,7 @@ static int check_header(struct swsusp_info *info)
1795 char *reason; 1796 char *reason;
1796 1797
1797 reason = check_image_kernel(info); 1798 reason = check_image_kernel(info);
1798 if (!reason && info->num_physpages != num_physpages) 1799 if (!reason && info->num_physpages != get_num_physpages())
1799 reason = "memory size"; 1800 reason = "memory size";
1800 if (reason) { 1801 if (reason) {
1801 printk(KERN_ERR "PM: Image mismatch: %s\n", reason); 1802 printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index bef86d121eb2..ece04223bb1e 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -269,7 +269,7 @@ int suspend_devices_and_enter(suspend_state_t state)
269 suspend_test_start(); 269 suspend_test_start();
270 error = dpm_suspend_start(PMSG_SUSPEND); 270 error = dpm_suspend_start(PMSG_SUSPEND);
271 if (error) { 271 if (error) {
272 printk(KERN_ERR "PM: Some devices failed to suspend\n"); 272 pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
273 goto Recover_platform; 273 goto Recover_platform;
274 } 274 }
275 suspend_test_finish("suspend devices"); 275 suspend_test_finish("suspend devices");
diff --git a/kernel/printk.c b/kernel/printk.c
index 8212c1aef125..d37d45c90ae6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1369,9 +1369,9 @@ static int console_trylock_for_printk(unsigned int cpu)
1369 } 1369 }
1370 } 1370 }
1371 logbuf_cpu = UINT_MAX; 1371 logbuf_cpu = UINT_MAX;
1372 raw_spin_unlock(&logbuf_lock);
1372 if (wake) 1373 if (wake)
1373 up(&console_sem); 1374 up(&console_sem);
1374 raw_spin_unlock(&logbuf_lock);
1375 return retval; 1375 return retval;
1376} 1376}
1377 1377
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 335a7ae697f5..4041f5747e73 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -469,6 +469,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
469 /* Architecture-specific hardware disable .. */ 469 /* Architecture-specific hardware disable .. */
470 ptrace_disable(child); 470 ptrace_disable(child);
471 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); 471 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
472 flush_ptrace_hw_breakpoint(child);
472 473
473 write_lock_irq(&tasklist_lock); 474 write_lock_irq(&tasklist_lock);
474 /* 475 /*
@@ -844,6 +845,47 @@ int ptrace_request(struct task_struct *child, long request,
844 ret = ptrace_setsiginfo(child, &siginfo); 845 ret = ptrace_setsiginfo(child, &siginfo);
845 break; 846 break;
846 847
848 case PTRACE_GETSIGMASK:
849 if (addr != sizeof(sigset_t)) {
850 ret = -EINVAL;
851 break;
852 }
853
854 if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
855 ret = -EFAULT;
856 else
857 ret = 0;
858
859 break;
860
861 case PTRACE_SETSIGMASK: {
862 sigset_t new_set;
863
864 if (addr != sizeof(sigset_t)) {
865 ret = -EINVAL;
866 break;
867 }
868
869 if (copy_from_user(&new_set, datavp, sizeof(sigset_t))) {
870 ret = -EFAULT;
871 break;
872 }
873
874 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
875
876 /*
877 * Every thread does recalc_sigpending() after resume, so
878 * retarget_shared_pending() and recalc_sigpending() are not
879 * called here.
880 */
881 spin_lock_irq(&child->sighand->siglock);
882 child->blocked = new_set;
883 spin_unlock_irq(&child->sighand->siglock);
884
885 ret = 0;
886 break;
887 }
888
847 case PTRACE_INTERRUPT: 889 case PTRACE_INTERRUPT:
848 /* 890 /*
849 * Stop tracee without any side-effect on signal or job 891 * Stop tracee without any side-effect on signal or job
@@ -948,8 +990,7 @@ int ptrace_request(struct task_struct *child, long request,
948 990
949#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 991#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
950 case PTRACE_GETREGSET: 992 case PTRACE_GETREGSET:
951 case PTRACE_SETREGSET: 993 case PTRACE_SETREGSET: {
952 {
953 struct iovec kiov; 994 struct iovec kiov;
954 struct iovec __user *uiov = datavp; 995 struct iovec __user *uiov = datavp;
955 996
@@ -1181,19 +1222,3 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
1181 return ret; 1222 return ret;
1182} 1223}
1183#endif /* CONFIG_COMPAT */ 1224#endif /* CONFIG_COMPAT */
1184
1185#ifdef CONFIG_HAVE_HW_BREAKPOINT
1186int ptrace_get_breakpoints(struct task_struct *tsk)
1187{
1188 if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
1189 return 0;
1190
1191 return -1;
1192}
1193
1194void ptrace_put_breakpoints(struct task_struct *tsk)
1195{
1196 if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
1197 flush_ptrace_hw_breakpoint(tsk);
1198}
1199#endif /* CONFIG_HAVE_HW_BREAKPOINT */
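
PTRACE_GETSIGMASK and PTRACE_SETSIGMASK above give debuggers and checkpoint/restore tools direct access to a tracee's blocked-signal mask. A hedged userspace sketch of the calling convention follows; the numeric fallbacks are the values used by the matching uapi header update in this series, and the helper itself is hypothetical. Note that addr carries the size of the kernel's sigset_t (8 bytes with 64 signals), not glibc's much larger sigset_t.

#include <signal.h>
#include <stdint.h>
#include <sys/ptrace.h>
#include <sys/types.h>

#ifndef PTRACE_GETSIGMASK
#define PTRACE_GETSIGMASK	0x420a
#define PTRACE_SETSIGMASK	0x420b
#endif

#define KERNEL_SIGSET_BYTES	8UL	/* sizeof(sigset_t) as the kernel sees it */

/* Tracee must already be ptrace-attached and stopped. */
int adjust_tracee_sigmask(pid_t pid)
{
	uint64_t mask;

	if (ptrace(PTRACE_GETSIGMASK, pid, (void *)KERNEL_SIGSET_BYTES, &mask) < 0)
		return -1;	/* EINVAL if the size in addr is wrong */

	mask |= 1ULL << (SIGUSR1 - 1);	/* block SIGUSR1 in the tracee */

	/* SIGKILL and SIGSTOP bits are stripped by the kernel before applying the set. */
	return ptrace(PTRACE_SETSIGMASK, pid, (void *)KERNEL_SIGSET_BYTES, &mask) < 0 ? -1 : 0;
}
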
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index cf3adc6fe001..e08abb9461ac 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -3026,7 +3026,7 @@ static int __init rcu_spawn_gp_kthread(void)
3026 struct task_struct *t; 3026 struct task_struct *t;
3027 3027
3028 for_each_rcu_flavor(rsp) { 3028 for_each_rcu_flavor(rsp) {
3029 t = kthread_run(rcu_gp_kthread, rsp, rsp->name); 3029 t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
3030 BUG_ON(IS_ERR(t)); 3030 BUG_ON(IS_ERR(t));
3031 rnp = rcu_get_root(rsp); 3031 rnp = rcu_get_root(rsp);
3032 raw_spin_lock_irqsave(&rnp->lock, flags); 3032 raw_spin_lock_irqsave(&rnp->lock, flags);
diff --git a/kernel/reboot.c b/kernel/reboot.c
new file mode 100644
index 000000000000..269ed9384cc4
--- /dev/null
+++ b/kernel/reboot.c
@@ -0,0 +1,419 @@
1/*
2 * linux/kernel/reboot.c
3 *
4 * Copyright (C) 2013 Linus Torvalds
5 */
6
7#define pr_fmt(fmt) "reboot: " fmt
8
9#include <linux/ctype.h>
10#include <linux/export.h>
11#include <linux/kexec.h>
12#include <linux/kmod.h>
13#include <linux/kmsg_dump.h>
14#include <linux/reboot.h>
15#include <linux/suspend.h>
16#include <linux/syscalls.h>
17#include <linux/syscore_ops.h>
18#include <linux/uaccess.h>
19
20/*
21 * this indicates whether you can reboot with ctrl-alt-del: the default is yes
22 */
23
24int C_A_D = 1;
25struct pid *cad_pid;
26EXPORT_SYMBOL(cad_pid);
27
28#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32)
29#define DEFAULT_REBOOT_MODE = REBOOT_HARD
30#else
31#define DEFAULT_REBOOT_MODE
32#endif
33enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
34
35int reboot_default;
36int reboot_cpu;
37enum reboot_type reboot_type = BOOT_ACPI;
38int reboot_force;
39
40/*
41 * If set, this is used for preparing the system to power off.
42 */
43
44void (*pm_power_off_prepare)(void);
45
46/**
47 * emergency_restart - reboot the system
48 *
49 * Without shutting down any hardware or taking any locks
50 * reboot the system. This is called when we know we are in
51 * trouble so this is our best effort to reboot. This is
52 * safe to call in interrupt context.
53 */
54void emergency_restart(void)
55{
56 kmsg_dump(KMSG_DUMP_EMERG);
57 machine_emergency_restart();
58}
59EXPORT_SYMBOL_GPL(emergency_restart);
60
61void kernel_restart_prepare(char *cmd)
62{
63 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
64 system_state = SYSTEM_RESTART;
65 usermodehelper_disable();
66 device_shutdown();
67}
68
69/**
70 * register_reboot_notifier - Register function to be called at reboot time
71 * @nb: Info about notifier function to be called
72 *
73 * Registers a function with the list of functions
74 * to be called at reboot time.
75 *
76 * Currently always returns zero, as blocking_notifier_chain_register()
77 * always returns zero.
78 */
79int register_reboot_notifier(struct notifier_block *nb)
80{
81 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
82}
83EXPORT_SYMBOL(register_reboot_notifier);
84
85/**
86 * unregister_reboot_notifier - Unregister previously registered reboot notifier
87 * @nb: Hook to be unregistered
88 *
89 * Unregisters a previously registered reboot
90 * notifier function.
91 *
92 * Returns zero on success, or %-ENOENT on failure.
93 */
94int unregister_reboot_notifier(struct notifier_block *nb)
95{
96 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
97}
98EXPORT_SYMBOL(unregister_reboot_notifier);
99
100static void migrate_to_reboot_cpu(void)
101{
102 /* The boot cpu is always logical cpu 0 */
103 int cpu = reboot_cpu;
104
105 cpu_hotplug_disable();
106
107 /* Make certain the cpu I'm about to reboot on is online */
108 if (!cpu_online(cpu))
109 cpu = cpumask_first(cpu_online_mask);
110
111 /* Prevent races with other tasks migrating this task */
112 current->flags |= PF_NO_SETAFFINITY;
113
114 /* Make certain I only run on the appropriate processor */
115 set_cpus_allowed_ptr(current, cpumask_of(cpu));
116}
117
118/**
119 * kernel_restart - reboot the system
120 * @cmd: pointer to buffer containing command to execute for restart
121 * or %NULL
122 *
123 * Shutdown everything and perform a clean reboot.
124 * This is not safe to call in interrupt context.
125 */
126void kernel_restart(char *cmd)
127{
128 kernel_restart_prepare(cmd);
129 migrate_to_reboot_cpu();
130 syscore_shutdown();
131 if (!cmd)
132 pr_emerg("Restarting system\n");
133 else
134 pr_emerg("Restarting system with command '%s'\n", cmd);
135 kmsg_dump(KMSG_DUMP_RESTART);
136 machine_restart(cmd);
137}
138EXPORT_SYMBOL_GPL(kernel_restart);
139
140static void kernel_shutdown_prepare(enum system_states state)
141{
142 blocking_notifier_call_chain(&reboot_notifier_list,
143 (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL);
144 system_state = state;
145 usermodehelper_disable();
146 device_shutdown();
147}
148/**
149 * kernel_halt - halt the system
150 *
151 * Shutdown everything and perform a clean system halt.
152 */
153void kernel_halt(void)
154{
155 kernel_shutdown_prepare(SYSTEM_HALT);
156 migrate_to_reboot_cpu();
157 syscore_shutdown();
158 pr_emerg("System halted\n");
159 kmsg_dump(KMSG_DUMP_HALT);
160 machine_halt();
161}
162EXPORT_SYMBOL_GPL(kernel_halt);
163
164/**
165 * kernel_power_off - power_off the system
166 *
167 * Shutdown everything and perform a clean system power_off.
168 */
169void kernel_power_off(void)
170{
171 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
172 if (pm_power_off_prepare)
173 pm_power_off_prepare();
174 migrate_to_reboot_cpu();
175 syscore_shutdown();
176 pr_emerg("Power down\n");
177 kmsg_dump(KMSG_DUMP_POWEROFF);
178 machine_power_off();
179}
180EXPORT_SYMBOL_GPL(kernel_power_off);
181
182static DEFINE_MUTEX(reboot_mutex);
183
184/*
185 * Reboot system call: for obvious reasons only root may call it,
186 * and even root needs to set up some magic numbers in the registers
187 * so that some mistake won't make this reboot the whole machine.
188 * You can also set the meaning of the ctrl-alt-del-key here.
189 *
190 * reboot doesn't sync: do that yourself before calling this.
191 */
192SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
193 void __user *, arg)
194{
195 struct pid_namespace *pid_ns = task_active_pid_ns(current);
196 char buffer[256];
197 int ret = 0;
198
199 /* We only trust the superuser with rebooting the system. */
200 if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
201 return -EPERM;
202
203 /* For safety, we require "magic" arguments. */
204 if (magic1 != LINUX_REBOOT_MAGIC1 ||
205 (magic2 != LINUX_REBOOT_MAGIC2 &&
206 magic2 != LINUX_REBOOT_MAGIC2A &&
207 magic2 != LINUX_REBOOT_MAGIC2B &&
208 magic2 != LINUX_REBOOT_MAGIC2C))
209 return -EINVAL;
210
211 /*
212 * If pid namespaces are enabled and the current task is in a child
213 * pid_namespace, the command is handled by reboot_pid_ns() which will
214 * call do_exit().
215 */
216 ret = reboot_pid_ns(pid_ns, cmd);
217 if (ret)
218 return ret;
219
220 /* Instead of trying to make the power_off code look like
221 * halt when pm_power_off is not set do it the easy way.
222 */
223 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
224 cmd = LINUX_REBOOT_CMD_HALT;
225
226 mutex_lock(&reboot_mutex);
227 switch (cmd) {
228 case LINUX_REBOOT_CMD_RESTART:
229 kernel_restart(NULL);
230 break;
231
232 case LINUX_REBOOT_CMD_CAD_ON:
233 C_A_D = 1;
234 break;
235
236 case LINUX_REBOOT_CMD_CAD_OFF:
237 C_A_D = 0;
238 break;
239
240 case LINUX_REBOOT_CMD_HALT:
241 kernel_halt();
242 do_exit(0);
243 panic("cannot halt");
244
245 case LINUX_REBOOT_CMD_POWER_OFF:
246 kernel_power_off();
247 do_exit(0);
248 break;
249
250 case LINUX_REBOOT_CMD_RESTART2:
251 ret = strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1);
252 if (ret < 0) {
253 ret = -EFAULT;
254 break;
255 }
256 buffer[sizeof(buffer) - 1] = '\0';
257
258 kernel_restart(buffer);
259 break;
260
261#ifdef CONFIG_KEXEC
262 case LINUX_REBOOT_CMD_KEXEC:
263 ret = kernel_kexec();
264 break;
265#endif
266
267#ifdef CONFIG_HIBERNATION
268 case LINUX_REBOOT_CMD_SW_SUSPEND:
269 ret = hibernate();
270 break;
271#endif
272
273 default:
274 ret = -EINVAL;
275 break;
276 }
277 mutex_unlock(&reboot_mutex);
278 return ret;
279}
280
281static void deferred_cad(struct work_struct *dummy)
282{
283 kernel_restart(NULL);
284}
285
286/*
287 * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
288 * As it's called within an interrupt, it may NOT sync: the only choice
289 * is whether to reboot at once, or just ignore the ctrl-alt-del.
290 */
291void ctrl_alt_del(void)
292{
293 static DECLARE_WORK(cad_work, deferred_cad);
294
295 if (C_A_D)
296 schedule_work(&cad_work);
297 else
298 kill_cad_pid(SIGINT, 1);
299}
300
301char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
302
303static int __orderly_poweroff(bool force)
304{
305 char **argv;
306 static char *envp[] = {
307 "HOME=/",
308 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
309 NULL
310 };
311 int ret;
312
313 argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
314 if (argv) {
315 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
316 argv_free(argv);
317 } else {
318 ret = -ENOMEM;
319 }
320
321 if (ret && force) {
322 pr_warn("Failed to start orderly shutdown: forcing the issue\n");
323 /*
324 * I guess this should try to kick off some daemon to sync and
325 * poweroff asap. Or not even bother syncing if we're doing an
326 * emergency shutdown?
327 */
328 emergency_sync();
329 kernel_power_off();
330 }
331
332 return ret;
333}
334
335static bool poweroff_force;
336
337static void poweroff_work_func(struct work_struct *work)
338{
339 __orderly_poweroff(poweroff_force);
340}
341
342static DECLARE_WORK(poweroff_work, poweroff_work_func);
343
344/**
345 * orderly_poweroff - Trigger an orderly system poweroff
346 * @force: force poweroff if command execution fails
347 *
348 * This may be called from any context to trigger a system shutdown.
349 * If the orderly shutdown fails, it will force an immediate shutdown.
350 */
351int orderly_poweroff(bool force)
352{
353 if (force) /* do not override the pending "true" */
354 poweroff_force = true;
355 schedule_work(&poweroff_work);
356 return 0;
357}
358EXPORT_SYMBOL_GPL(orderly_poweroff);
359
360static int __init reboot_setup(char *str)
361{
362 for (;;) {
363 /*
364 * Having anything passed on the command line via
365 * reboot= will cause us to disable DMI checking
366 * below.
367 */
368 reboot_default = 0;
369
370 switch (*str) {
371 case 'w':
372 reboot_mode = REBOOT_WARM;
373 break;
374
375 case 'c':
376 reboot_mode = REBOOT_COLD;
377 break;
378
379 case 'h':
380 reboot_mode = REBOOT_HARD;
381 break;
382
383 case 's':
384 if (isdigit(*(str+1)))
385 reboot_cpu = simple_strtoul(str+1, NULL, 0);
386 else if (str[1] == 'm' && str[2] == 'p' &&
387 isdigit(*(str+3)))
388 reboot_cpu = simple_strtoul(str+3, NULL, 0);
389 else
390 reboot_mode = REBOOT_SOFT;
391 break;
392
393 case 'g':
394 reboot_mode = REBOOT_GPIO;
395 break;
396
397 case 'b':
398 case 'a':
399 case 'k':
400 case 't':
401 case 'e':
402 case 'p':
403 reboot_type = *str;
404 break;
405
406 case 'f':
407 reboot_force = 1;
408 break;
409 }
410
411 str = strchr(str, ',');
412 if (str)
413 str++;
414 else
415 break;
416 }
417 return 1;
418}
419__setup("reboot=", reboot_setup);
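
kernel_restart_prepare() and kernel_shutdown_prepare() above run reboot_notifier_list before devices are shut down, which is the hook that register_reboot_notifier() documents. A minimal sketch of a client follows; the module and callback names are made up.

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

/* Hypothetical driver hook: quiesce hardware before restart, halt or power-off. */
static int demo_reboot_notify(struct notifier_block *nb, unsigned long action, void *cmd)
{
	switch (action) {
	case SYS_RESTART:	/* from kernel_restart_prepare(); cmd may name a restart command */
	case SYS_HALT:
	case SYS_POWER_OFF:	/* both from kernel_shutdown_prepare() */
		/* flush/park the device here */
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block demo_reboot_nb = {
	.notifier_call = demo_reboot_notify,
};

static int __init demo_reboot_init(void)
{
	return register_reboot_notifier(&demo_reboot_nb);
}

static void __exit demo_reboot_exit(void)
{
	unregister_reboot_notifier(&demo_reboot_nb);
}

module_init(demo_reboot_init);
module_exit(demo_reboot_exit);
MODULE_LICENSE("GPL");

Note also that reboot_setup() keys off the first character of each comma-separated token, so a command line such as reboot=h,f selects REBOOT_HARD and sets reboot_force.
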
diff --git a/kernel/resource.c b/kernel/resource.c
index 77bf11a86c7d..3f285dce9347 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -449,7 +449,6 @@ static int __find_resource(struct resource *root, struct resource *old,
449 struct resource *this = root->child; 449 struct resource *this = root->child;
450 struct resource tmp = *new, avail, alloc; 450 struct resource tmp = *new, avail, alloc;
451 451
452 tmp.flags = new->flags;
453 tmp.start = root->start; 452 tmp.start = root->start;
454 /* 453 /*
455 * Skip past an allocated resource that starts at 0, since the assignment 454 * Skip past an allocated resource that starts at 0, since the assignment
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 17d7065c3872..5aef494fc8b4 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -162,6 +162,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
162 */ 162 */
163 163
164/** 164/**
165 * cputimer_running - return true if cputimer is running
166 *
167 * @tsk: Pointer to target task.
168 */
169static inline bool cputimer_running(struct task_struct *tsk)
170
171{
172 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
173
174 if (!cputimer->running)
175 return false;
176
177 /*
178 * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
179 * in __exit_signal(), we won't account to the signal struct further
180 * cputime consumed by that task, even though the task can still be
181 * ticking after __exit_signal().
182 *
183 * In order to keep a consistent behaviour between thread group cputime
184 * and thread group cputimer accounting, lets also ignore the cputime
185 * elapsing after __exit_signal() in any thread group timer running.
186 *
187 * This makes sure that POSIX CPU clocks and timers are synchronized, so
188 * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
189 * clock delta is behind the expiring timer value.
190 */
191 if (unlikely(!tsk->sighand))
192 return false;
193
194 return true;
195}
196
197/**
165 * account_group_user_time - Maintain utime for a thread group. 198 * account_group_user_time - Maintain utime for a thread group.
166 * 199 *
167 * @tsk: Pointer to task structure. 200 * @tsk: Pointer to task structure.
@@ -176,7 +209,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
176{ 209{
177 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 210 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
178 211
179 if (!cputimer->running) 212 if (!cputimer_running(tsk))
180 return; 213 return;
181 214
182 raw_spin_lock(&cputimer->lock); 215 raw_spin_lock(&cputimer->lock);
@@ -199,7 +232,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
199{ 232{
200 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 233 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
201 234
202 if (!cputimer->running) 235 if (!cputimer_running(tsk))
203 return; 236 return;
204 237
205 raw_spin_lock(&cputimer->lock); 238 raw_spin_lock(&cputimer->lock);
@@ -222,7 +255,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
222{ 255{
223 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 256 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
224 257
225 if (!cputimer->running) 258 if (!cputimer_running(tsk))
226 return; 259 return;
227 260
228 raw_spin_lock(&cputimer->lock); 261 raw_spin_lock(&cputimer->lock);
diff --git a/kernel/signal.c b/kernel/signal.c
index 113411bfe8b1..50e41075ac77 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2848,7 +2848,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2848 recalc_sigpending(); 2848 recalc_sigpending();
2849 spin_unlock_irq(&tsk->sighand->siglock); 2849 spin_unlock_irq(&tsk->sighand->siglock);
2850 2850
2851 timeout = schedule_timeout_interruptible(timeout); 2851 timeout = freezable_schedule_timeout_interruptible(timeout);
2852 2852
2853 spin_lock_irq(&tsk->sighand->siglock); 2853 spin_lock_irq(&tsk->sighand->siglock);
2854 __set_task_blocked(tsk, &tsk->real_blocked); 2854 __set_task_blocked(tsk, &tsk->real_blocked);
diff --git a/kernel/sys.c b/kernel/sys.c
index 2bbd9a73b54c..771129b299f8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -116,20 +116,6 @@ EXPORT_SYMBOL(fs_overflowuid);
116EXPORT_SYMBOL(fs_overflowgid); 116EXPORT_SYMBOL(fs_overflowgid);
117 117
118/* 118/*
119 * this indicates whether you can reboot with ctrl-alt-del: the default is yes
120 */
121
122int C_A_D = 1;
123struct pid *cad_pid;
124EXPORT_SYMBOL(cad_pid);
125
126/*
127 * If set, this is used for preparing the system to power off.
128 */
129
130void (*pm_power_off_prepare)(void);
131
132/*
133 * Returns true if current's euid is same as p's uid or euid, 119 * Returns true if current's euid is same as p's uid or euid,
134 * or has CAP_SYS_NICE to p's user_ns. 120 * or has CAP_SYS_NICE to p's user_ns.
135 * 121 *
@@ -308,266 +294,6 @@ out_unlock:
308 return retval; 294 return retval;
309} 295}
310 296
311/**
312 * emergency_restart - reboot the system
313 *
314 * Without shutting down any hardware or taking any locks
315 * reboot the system. This is called when we know we are in
316 * trouble so this is our best effort to reboot. This is
317 * safe to call in interrupt context.
318 */
319void emergency_restart(void)
320{
321 kmsg_dump(KMSG_DUMP_EMERG);
322 machine_emergency_restart();
323}
324EXPORT_SYMBOL_GPL(emergency_restart);
325
326void kernel_restart_prepare(char *cmd)
327{
328 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
329 system_state = SYSTEM_RESTART;
330 usermodehelper_disable();
331 device_shutdown();
332}
333
334/**
335 * register_reboot_notifier - Register function to be called at reboot time
336 * @nb: Info about notifier function to be called
337 *
338 * Registers a function with the list of functions
339 * to be called at reboot time.
340 *
341 * Currently always returns zero, as blocking_notifier_chain_register()
342 * always returns zero.
343 */
344int register_reboot_notifier(struct notifier_block *nb)
345{
346 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
347}
348EXPORT_SYMBOL(register_reboot_notifier);
349
350/**
351 * unregister_reboot_notifier - Unregister previously registered reboot notifier
352 * @nb: Hook to be unregistered
353 *
354 * Unregisters a previously registered reboot
355 * notifier function.
356 *
357 * Returns zero on success, or %-ENOENT on failure.
358 */
359int unregister_reboot_notifier(struct notifier_block *nb)
360{
361 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
362}
363EXPORT_SYMBOL(unregister_reboot_notifier);
364
365/* Add backwards compatibility for stable trees. */
366#ifndef PF_NO_SETAFFINITY
367#define PF_NO_SETAFFINITY PF_THREAD_BOUND
368#endif
369
370static void migrate_to_reboot_cpu(void)
371{
372 /* The boot cpu is always logical cpu 0 */
373 int cpu = 0;
374
375 cpu_hotplug_disable();
376
377 /* Make certain the cpu I'm about to reboot on is online */
378 if (!cpu_online(cpu))
379 cpu = cpumask_first(cpu_online_mask);
380
381 /* Prevent races with other tasks migrating this task */
382 current->flags |= PF_NO_SETAFFINITY;
383
384 /* Make certain I only run on the appropriate processor */
385 set_cpus_allowed_ptr(current, cpumask_of(cpu));
386}
387
388/**
389 * kernel_restart - reboot the system
390 * @cmd: pointer to buffer containing command to execute for restart
391 * or %NULL
392 *
393 * Shutdown everything and perform a clean reboot.
394 * This is not safe to call in interrupt context.
395 */
396void kernel_restart(char *cmd)
397{
398 kernel_restart_prepare(cmd);
399 migrate_to_reboot_cpu();
400 syscore_shutdown();
401 if (!cmd)
402 printk(KERN_EMERG "Restarting system.\n");
403 else
404 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
405 kmsg_dump(KMSG_DUMP_RESTART);
406 machine_restart(cmd);
407}
408EXPORT_SYMBOL_GPL(kernel_restart);
409
410static void kernel_shutdown_prepare(enum system_states state)
411{
412 blocking_notifier_call_chain(&reboot_notifier_list,
413 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
414 system_state = state;
415 usermodehelper_disable();
416 device_shutdown();
417}
418/**
419 * kernel_halt - halt the system
420 *
421 * Shutdown everything and perform a clean system halt.
422 */
423void kernel_halt(void)
424{
425 kernel_shutdown_prepare(SYSTEM_HALT);
426 migrate_to_reboot_cpu();
427 syscore_shutdown();
428 printk(KERN_EMERG "System halted.\n");
429 kmsg_dump(KMSG_DUMP_HALT);
430 machine_halt();
431}
432
433EXPORT_SYMBOL_GPL(kernel_halt);
434
435/**
436 * kernel_power_off - power_off the system
437 *
438 * Shutdown everything and perform a clean system power_off.
439 */
440void kernel_power_off(void)
441{
442 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
443 if (pm_power_off_prepare)
444 pm_power_off_prepare();
445 migrate_to_reboot_cpu();
446 syscore_shutdown();
447 printk(KERN_EMERG "Power down.\n");
448 kmsg_dump(KMSG_DUMP_POWEROFF);
449 machine_power_off();
450}
451EXPORT_SYMBOL_GPL(kernel_power_off);
452
453static DEFINE_MUTEX(reboot_mutex);
454
455/*
456 * Reboot system call: for obvious reasons only root may call it,
457 * and even root needs to set up some magic numbers in the registers
458 * so that some mistake won't make this reboot the whole machine.
459 * You can also set the meaning of the ctrl-alt-del-key here.
460 *
461 * reboot doesn't sync: do that yourself before calling this.
462 */
463SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
464 void __user *, arg)
465{
466 struct pid_namespace *pid_ns = task_active_pid_ns(current);
467 char buffer[256];
468 int ret = 0;
469
470 /* We only trust the superuser with rebooting the system. */
471 if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
472 return -EPERM;
473
474 /* For safety, we require "magic" arguments. */
475 if (magic1 != LINUX_REBOOT_MAGIC1 ||
476 (magic2 != LINUX_REBOOT_MAGIC2 &&
477 magic2 != LINUX_REBOOT_MAGIC2A &&
478 magic2 != LINUX_REBOOT_MAGIC2B &&
479 magic2 != LINUX_REBOOT_MAGIC2C))
480 return -EINVAL;
481
482 /*
483 * If pid namespaces are enabled and the current task is in a child
484 * pid_namespace, the command is handled by reboot_pid_ns() which will
485 * call do_exit().
486 */
487 ret = reboot_pid_ns(pid_ns, cmd);
488 if (ret)
489 return ret;
490
491 /* Instead of trying to make the power_off code look like
492 * halt when pm_power_off is not set do it the easy way.
493 */
494 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
495 cmd = LINUX_REBOOT_CMD_HALT;
496
497 mutex_lock(&reboot_mutex);
498 switch (cmd) {
499 case LINUX_REBOOT_CMD_RESTART:
500 kernel_restart(NULL);
501 break;
502
503 case LINUX_REBOOT_CMD_CAD_ON:
504 C_A_D = 1;
505 break;
506
507 case LINUX_REBOOT_CMD_CAD_OFF:
508 C_A_D = 0;
509 break;
510
511 case LINUX_REBOOT_CMD_HALT:
512 kernel_halt();
513 do_exit(0);
514 panic("cannot halt");
515
516 case LINUX_REBOOT_CMD_POWER_OFF:
517 kernel_power_off();
518 do_exit(0);
519 break;
520
521 case LINUX_REBOOT_CMD_RESTART2:
522 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
523 ret = -EFAULT;
524 break;
525 }
526 buffer[sizeof(buffer) - 1] = '\0';
527
528 kernel_restart(buffer);
529 break;
530
531#ifdef CONFIG_KEXEC
532 case LINUX_REBOOT_CMD_KEXEC:
533 ret = kernel_kexec();
534 break;
535#endif
536
537#ifdef CONFIG_HIBERNATION
538 case LINUX_REBOOT_CMD_SW_SUSPEND:
539 ret = hibernate();
540 break;
541#endif
542
543 default:
544 ret = -EINVAL;
545 break;
546 }
547 mutex_unlock(&reboot_mutex);
548 return ret;
549}
550
551static void deferred_cad(struct work_struct *dummy)
552{
553 kernel_restart(NULL);
554}
555
556/*
557 * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
558 * As it's called within an interrupt, it may NOT sync: the only choice
559 * is whether to reboot at once, or just ignore the ctrl-alt-del.
560 */
561void ctrl_alt_del(void)
562{
563 static DECLARE_WORK(cad_work, deferred_cad);
564
565 if (C_A_D)
566 schedule_work(&cad_work);
567 else
568 kill_cad_pid(SIGINT, 1);
569}
570
571/* 297/*
572 * Unprivileged users may change the real gid to the effective gid 298 * Unprivileged users may change the real gid to the effective gid
573 * or vice versa. (BSD-style) 299 * or vice versa. (BSD-style)
@@ -1309,6 +1035,17 @@ out:
1309 return retval; 1035 return retval;
1310} 1036}
1311 1037
1038static void set_special_pids(struct pid *pid)
1039{
1040 struct task_struct *curr = current->group_leader;
1041
1042 if (task_session(curr) != pid)
1043 change_pid(curr, PIDTYPE_SID, pid);
1044
1045 if (task_pgrp(curr) != pid)
1046 change_pid(curr, PIDTYPE_PGID, pid);
1047}
1048
1312SYSCALL_DEFINE0(setsid) 1049SYSCALL_DEFINE0(setsid)
1313{ 1050{
1314 struct task_struct *group_leader = current->group_leader; 1051 struct task_struct *group_leader = current->group_leader;
@@ -1328,7 +1065,7 @@ SYSCALL_DEFINE0(setsid)
1328 goto out; 1065 goto out;
1329 1066
1330 group_leader->signal->leader = 1; 1067 group_leader->signal->leader = 1;
1331 __set_special_pids(sid); 1068 set_special_pids(sid);
1332 1069
1333 proc_clear_tty(group_leader); 1070 proc_clear_tty(group_leader);
1334 1071
@@ -2281,68 +2018,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2281 return err ? -EFAULT : 0; 2018 return err ? -EFAULT : 0;
2282} 2019}
2283 2020
2284char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2285
2286static int __orderly_poweroff(bool force)
2287{
2288 char **argv;
2289 static char *envp[] = {
2290 "HOME=/",
2291 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2292 NULL
2293 };
2294 int ret;
2295
2296 argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
2297 if (argv) {
2298 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
2299 argv_free(argv);
2300 } else {
2301 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2302 __func__, poweroff_cmd);
2303 ret = -ENOMEM;
2304 }
2305
2306 if (ret && force) {
2307 printk(KERN_WARNING "Failed to start orderly shutdown: "
2308 "forcing the issue\n");
2309 /*
2310 * I guess this should try to kick off some daemon to sync and
2311 * poweroff asap. Or not even bother syncing if we're doing an
2312 * emergency shutdown?
2313 */
2314 emergency_sync();
2315 kernel_power_off();
2316 }
2317
2318 return ret;
2319}
2320
2321static bool poweroff_force;
2322
2323static void poweroff_work_func(struct work_struct *work)
2324{
2325 __orderly_poweroff(poweroff_force);
2326}
2327
2328static DECLARE_WORK(poweroff_work, poweroff_work_func);
2329
2330/**
2331 * orderly_poweroff - Trigger an orderly system poweroff
2332 * @force: force poweroff if command execution fails
2333 *
2334 * This may be called from any context to trigger a system shutdown.
2335 * If the orderly shutdown fails, it will force an immediate shutdown.
2336 */
2337int orderly_poweroff(bool force)
2338{
2339 if (force) /* do not override the pending "true" */
2340 poweroff_force = true;
2341 schedule_work(&poweroff_work);
2342 return 0;
2343}
2344EXPORT_SYMBOL_GPL(orderly_poweroff);
2345
2346/** 2021/**
2347 * do_sysinfo - fill in sysinfo struct 2022 * do_sysinfo - fill in sysinfo struct
2348 * @info: pointer to buffer to fill 2023 * @info: pointer to buffer to fill
@@ -2355,8 +2030,7 @@ static int do_sysinfo(struct sysinfo *info)
2355 2030
2356 memset(info, 0, sizeof(struct sysinfo)); 2031 memset(info, 0, sizeof(struct sysinfo));
2357 2032
2358 ktime_get_ts(&tp); 2033 get_monotonic_boottime(&tp);
2359 monotonic_to_bootbased(&tp);
2360 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); 2034 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
2361 2035
2362 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); 2036 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
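
The reboot(2) implementation removed here is re-added verbatim in kernel/reboot.c above, so the userspace contract is unchanged: both magic numbers must match, the caller needs CAP_SYS_BOOT, and the syscall does not sync. A small illustrative caller follows, using the raw syscall because the glibc reboot() wrapper does not pass the LINUX_REBOOT_CMD_RESTART2 argument string; the bootloader argument shown is hypothetical.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/reboot.h>

int main(void)
{
	sync();		/* reboot(2) itself does not sync */
	if (syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
		    LINUX_REBOOT_CMD_RESTART2, "bootloader-arg") < 0)
		perror("reboot");	/* EPERM without CAP_SYS_BOOT, EINVAL on bad magic */
	return 1;	/* only reached on failure */
}
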
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5c9e33b5c0eb..ac09d98490aa 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -599,6 +599,13 @@ static struct ctl_table kern_table[] = {
599 .mode = 0644, 599 .mode = 0644,
600 .proc_handler = proc_dointvec, 600 .proc_handler = proc_dointvec,
601 }, 601 },
602 {
603 .procname = "traceoff_on_warning",
604 .data = &__disable_trace_on_warning,
605 .maxlen = sizeof(__disable_trace_on_warning),
606 .mode = 0644,
607 .proc_handler = proc_dointvec,
608 },
602#endif 609#endif
603#ifdef CONFIG_MODULES 610#ifdef CONFIG_MODULES
604 { 611 {
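
The new traceoff_on_warning entry lands in kern_table, so it is reachable as /proc/sys/kernel/traceoff_on_warning. A trivial sketch of enabling it from a C program, equivalent to echoing 1 into that file:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/traceoff_on_warning", "w");

	if (!f) {
		perror("traceoff_on_warning");
		return 1;
	}
	fputs("1\n", f);	/* a non-zero value stops tracing on the next WARN() */
	return fclose(f) ? 1 : 0;
}
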
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index aea4a9ea6fc8..b609213ca9a2 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -3,7 +3,6 @@
3#include "../fs/xfs/xfs_sysctl.h" 3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h> 4#include <linux/sunrpc/debug.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h>
7#include <linux/syscalls.h> 6#include <linux/syscalls.h>
8#include <linux/namei.h> 7#include <linux/namei.h>
9#include <linux/mount.h> 8#include <linux/mount.h>
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ff7d9d2ab504..9250130646f5 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -4,6 +4,8 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
6obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o 6obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
7obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
7obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o 8obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
8obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o 9obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
9obj-$(CONFIG_TIMER_STATS) += timer_stats.o 10obj-$(CONFIG_TIMER_STATS) += timer_stats.o
11obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index f11d83b12949..eec50fcef9e4 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -199,6 +199,13 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
199 199
200} 200}
201 201
202ktime_t alarm_expires_remaining(const struct alarm *alarm)
203{
204 struct alarm_base *base = &alarm_bases[alarm->type];
205 return ktime_sub(alarm->node.expires, base->gettime());
206}
207EXPORT_SYMBOL_GPL(alarm_expires_remaining);
208
202#ifdef CONFIG_RTC_CLASS 209#ifdef CONFIG_RTC_CLASS
203/** 210/**
204 * alarmtimer_suspend - Suspend time callback 211 * alarmtimer_suspend - Suspend time callback
@@ -303,9 +310,10 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
303 alarm->type = type; 310 alarm->type = type;
304 alarm->state = ALARMTIMER_STATE_INACTIVE; 311 alarm->state = ALARMTIMER_STATE_INACTIVE;
305} 312}
313EXPORT_SYMBOL_GPL(alarm_init);
306 314
307/** 315/**
308 * alarm_start - Sets an alarm to fire 316 * alarm_start - Sets an absolute alarm to fire
309 * @alarm: ptr to alarm to set 317 * @alarm: ptr to alarm to set
310 * @start: time to run the alarm 318 * @start: time to run the alarm
311 */ 319 */
@@ -323,6 +331,34 @@ int alarm_start(struct alarm *alarm, ktime_t start)
323 spin_unlock_irqrestore(&base->lock, flags); 331 spin_unlock_irqrestore(&base->lock, flags);
324 return ret; 332 return ret;
325} 333}
334EXPORT_SYMBOL_GPL(alarm_start);
335
336/**
337 * alarm_start_relative - Sets a relative alarm to fire
338 * @alarm: ptr to alarm to set
339 * @start: time relative to now to run the alarm
340 */
341int alarm_start_relative(struct alarm *alarm, ktime_t start)
342{
343 struct alarm_base *base = &alarm_bases[alarm->type];
344
345 start = ktime_add(start, base->gettime());
346 return alarm_start(alarm, start);
347}
348EXPORT_SYMBOL_GPL(alarm_start_relative);
349
350void alarm_restart(struct alarm *alarm)
351{
352 struct alarm_base *base = &alarm_bases[alarm->type];
353 unsigned long flags;
354
355 spin_lock_irqsave(&base->lock, flags);
356 hrtimer_set_expires(&alarm->timer, alarm->node.expires);
357 hrtimer_restart(&alarm->timer);
358 alarmtimer_enqueue(base, alarm);
359 spin_unlock_irqrestore(&base->lock, flags);
360}
361EXPORT_SYMBOL_GPL(alarm_restart);
326 362
327/** 363/**
328 * alarm_try_to_cancel - Tries to cancel an alarm timer 364 * alarm_try_to_cancel - Tries to cancel an alarm timer
@@ -344,6 +380,7 @@ int alarm_try_to_cancel(struct alarm *alarm)
344 spin_unlock_irqrestore(&base->lock, flags); 380 spin_unlock_irqrestore(&base->lock, flags);
345 return ret; 381 return ret;
346} 382}
383EXPORT_SYMBOL_GPL(alarm_try_to_cancel);
347 384
348 385
349/** 386/**
@@ -361,6 +398,7 @@ int alarm_cancel(struct alarm *alarm)
361 cpu_relax(); 398 cpu_relax();
362 } 399 }
363} 400}
401EXPORT_SYMBOL_GPL(alarm_cancel);
364 402
365 403
366u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) 404u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
@@ -393,8 +431,15 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
393 alarm->node.expires = ktime_add(alarm->node.expires, interval); 431 alarm->node.expires = ktime_add(alarm->node.expires, interval);
394 return overrun; 432 return overrun;
395} 433}
434EXPORT_SYMBOL_GPL(alarm_forward);
396 435
436u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
437{
438 struct alarm_base *base = &alarm_bases[alarm->type];
397 439
440 return alarm_forward(alarm, base->gettime(), interval);
441}
442EXPORT_SYMBOL_GPL(alarm_forward_now);
398 443
399 444
400/** 445/**
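
The exports above make the in-kernel alarmtimer API usable from modules, including the new relative helpers. A hedged sketch of the intended call pattern; the 5-second period and all names are arbitrary.

#include <linux/alarmtimer.h>
#include <linux/ktime.h>
#include <linux/module.h>

static struct alarm demo_alarm;

/* Runs from the alarmtimer callback; push the expiry 5 s further out and rearm. */
static enum alarmtimer_restart demo_alarm_fn(struct alarm *alarm, ktime_t now)
{
	alarm_forward_now(alarm, ktime_set(5, 0));
	return ALARMTIMER_RESTART;
}

static int __init demo_alarm_init(void)
{
	alarm_init(&demo_alarm, ALARM_REALTIME, demo_alarm_fn);
	/* First expiry 5 seconds from now, via the newly exported relative helper. */
	alarm_start_relative(&demo_alarm, ktime_set(5, 0));
	return 0;
}

static void __exit demo_alarm_exit(void)
{
	alarm_cancel(&demo_alarm);
}

module_init(demo_alarm_init);
module_exit(demo_alarm_exit);
MODULE_LICENSE("GPL");
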
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index c6d6400ee137..38959c866789 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -15,20 +15,23 @@
15#include <linux/hrtimer.h> 15#include <linux/hrtimer.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/notifier.h>
19#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/device.h>
20 20
21#include "tick-internal.h" 21#include "tick-internal.h"
22 22
23/* The registered clock event devices */ 23/* The registered clock event devices */
24static LIST_HEAD(clockevent_devices); 24static LIST_HEAD(clockevent_devices);
25static LIST_HEAD(clockevents_released); 25static LIST_HEAD(clockevents_released);
26
27/* Notification for clock events */
28static RAW_NOTIFIER_HEAD(clockevents_chain);
29
30/* Protection for the above */ 26/* Protection for the above */
31static DEFINE_RAW_SPINLOCK(clockevents_lock); 27static DEFINE_RAW_SPINLOCK(clockevents_lock);
28/* Protection for unbind operations */
29static DEFINE_MUTEX(clockevents_mutex);
30
31struct ce_unbind {
32 struct clock_event_device *ce;
33 int res;
34};
32 35
33/** 36/**
34 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 37 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
@@ -232,47 +235,107 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
232 return (rc && force) ? clockevents_program_min_delta(dev) : rc; 235 return (rc && force) ? clockevents_program_min_delta(dev) : rc;
233} 236}
234 237
235/** 238/*
236 * clockevents_register_notifier - register a clock events change listener 239 * Called after a notify add to make devices available which were
240 * released from the notifier call.
237 */ 241 */
238int clockevents_register_notifier(struct notifier_block *nb) 242static void clockevents_notify_released(void)
239{ 243{
240 unsigned long flags; 244 struct clock_event_device *dev;
241 int ret;
242 245
243 raw_spin_lock_irqsave(&clockevents_lock, flags); 246 while (!list_empty(&clockevents_released)) {
244 ret = raw_notifier_chain_register(&clockevents_chain, nb); 247 dev = list_entry(clockevents_released.next,
245 raw_spin_unlock_irqrestore(&clockevents_lock, flags); 248 struct clock_event_device, list);
249 list_del(&dev->list);
250 list_add(&dev->list, &clockevent_devices);
251 tick_check_new_device(dev);
252 }
253}
246 254
247 return ret; 255/*
256 * Try to install a replacement clock event device
257 */
258static int clockevents_replace(struct clock_event_device *ced)
259{
260 struct clock_event_device *dev, *newdev = NULL;
261
262 list_for_each_entry(dev, &clockevent_devices, list) {
263 if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED)
264 continue;
265
266 if (!tick_check_replacement(newdev, dev))
267 continue;
268
269 if (!try_module_get(dev->owner))
270 continue;
271
272 if (newdev)
273 module_put(newdev->owner);
274 newdev = dev;
275 }
276 if (newdev) {
277 tick_install_replacement(newdev);
278 list_del_init(&ced->list);
279 }
280 return newdev ? 0 : -EBUSY;
248} 281}
249 282
250/* 283/*
251 * Notify about a clock event change. Called with clockevents_lock 284 * Called with clockevents_mutex and clockevents_lock held
252 * held.
253 */ 285 */
254static void clockevents_do_notify(unsigned long reason, void *dev) 286static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
255{ 287{
256 raw_notifier_call_chain(&clockevents_chain, reason, dev); 288 /* Fast track. Device is unused */
289 if (ced->mode == CLOCK_EVT_MODE_UNUSED) {
290 list_del_init(&ced->list);
291 return 0;
292 }
293
294 return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY;
257} 295}
258 296
259/* 297/*
260 * Called after a notify add to make devices available which were 298 * SMP function call to unbind a device
261 * released from the notifier call.
262 */ 299 */
263static void clockevents_notify_released(void) 300static void __clockevents_unbind(void *arg)
264{ 301{
265 struct clock_event_device *dev; 302 struct ce_unbind *cu = arg;
303 int res;
304
305 raw_spin_lock(&clockevents_lock);
306 res = __clockevents_try_unbind(cu->ce, smp_processor_id());
307 if (res == -EAGAIN)
308 res = clockevents_replace(cu->ce);
309 cu->res = res;
310 raw_spin_unlock(&clockevents_lock);
311}
266 312
267 while (!list_empty(&clockevents_released)) { 313/*
268 dev = list_entry(clockevents_released.next, 314 * Issues smp function call to unbind a per cpu device. Called with
269 struct clock_event_device, list); 315 * clockevents_mutex held.
270 list_del(&dev->list); 316 */
271 list_add(&dev->list, &clockevent_devices); 317static int clockevents_unbind(struct clock_event_device *ced, int cpu)
272 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 318{
273 } 319 struct ce_unbind cu = { .ce = ced, .res = -ENODEV };
320
321 smp_call_function_single(cpu, __clockevents_unbind, &cu, 1);
322 return cu.res;
274} 323}
275 324
325/*
326 * Unbind a clockevents device.
327 */
328int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
329{
330 int ret;
331
332 mutex_lock(&clockevents_mutex);
333 ret = clockevents_unbind(ced, cpu);
334 mutex_unlock(&clockevents_mutex);
335 return ret;
336}
337EXPORT_SYMBOL_GPL(clockevents_unbind);
338
276/** 339/**
277 * clockevents_register_device - register a clock event device 340 * clockevents_register_device - register a clock event device
278 * @dev: device to register 341 * @dev: device to register
@@ -290,7 +353,7 @@ void clockevents_register_device(struct clock_event_device *dev)
290 raw_spin_lock_irqsave(&clockevents_lock, flags); 353 raw_spin_lock_irqsave(&clockevents_lock, flags);
291 354
292 list_add(&dev->list, &clockevent_devices); 355 list_add(&dev->list, &clockevent_devices);
293 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 356 tick_check_new_device(dev);
294 clockevents_notify_released(); 357 clockevents_notify_released();
295 358
296 raw_spin_unlock_irqrestore(&clockevents_lock, flags); 359 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
@@ -386,6 +449,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
386 * released list and do a notify add later. 449 * released list and do a notify add later.
387 */ 450 */
388 if (old) { 451 if (old) {
452 module_put(old->owner);
389 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); 453 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
390 list_del(&old->list); 454 list_del(&old->list);
391 list_add(&old->list, &clockevents_released); 455 list_add(&old->list, &clockevents_released);
@@ -433,10 +497,36 @@ void clockevents_notify(unsigned long reason, void *arg)
433 int cpu; 497 int cpu;
434 498
435 raw_spin_lock_irqsave(&clockevents_lock, flags); 499 raw_spin_lock_irqsave(&clockevents_lock, flags);
436 clockevents_do_notify(reason, arg);
437 500
438 switch (reason) { 501 switch (reason) {
502 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
503 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
504 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
505 tick_broadcast_on_off(reason, arg);
506 break;
507
508 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
509 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
510 tick_broadcast_oneshot_control(reason);
511 break;
512
513 case CLOCK_EVT_NOTIFY_CPU_DYING:
514 tick_handover_do_timer(arg);
515 break;
516
517 case CLOCK_EVT_NOTIFY_SUSPEND:
518 tick_suspend();
519 tick_suspend_broadcast();
520 break;
521
522 case CLOCK_EVT_NOTIFY_RESUME:
523 tick_resume();
524 break;
525
439 case CLOCK_EVT_NOTIFY_CPU_DEAD: 526 case CLOCK_EVT_NOTIFY_CPU_DEAD:
527 tick_shutdown_broadcast_oneshot(arg);
528 tick_shutdown_broadcast(arg);
529 tick_shutdown(arg);
440 /* 530 /*
441 * Unregister the clock event devices which were 531 * Unregister the clock event devices which were
442 * released from the users in the notify chain. 532 * released from the users in the notify chain.
@@ -462,4 +552,123 @@ void clockevents_notify(unsigned long reason, void *arg)
462 raw_spin_unlock_irqrestore(&clockevents_lock, flags); 552 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
463} 553}
464EXPORT_SYMBOL_GPL(clockevents_notify); 554EXPORT_SYMBOL_GPL(clockevents_notify);
555
556#ifdef CONFIG_SYSFS
557struct bus_type clockevents_subsys = {
558 .name = "clockevents",
559 .dev_name = "clockevent",
560};
561
562static DEFINE_PER_CPU(struct device, tick_percpu_dev);
563static struct tick_device *tick_get_tick_dev(struct device *dev);
564
565static ssize_t sysfs_show_current_tick_dev(struct device *dev,
566 struct device_attribute *attr,
567 char *buf)
568{
569 struct tick_device *td;
570 ssize_t count = 0;
571
572 raw_spin_lock_irq(&clockevents_lock);
573 td = tick_get_tick_dev(dev);
574 if (td && td->evtdev)
575 count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name);
576 raw_spin_unlock_irq(&clockevents_lock);
577 return count;
578}
579static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL);
580
581/* We don't support the abomination of removable broadcast devices */
582static ssize_t sysfs_unbind_tick_dev(struct device *dev,
583 struct device_attribute *attr,
584 const char *buf, size_t count)
585{
586 char name[CS_NAME_LEN];
587 size_t ret = sysfs_get_uname(buf, name, count);
588 struct clock_event_device *ce;
589
590 if (ret < 0)
591 return ret;
592
593 ret = -ENODEV;
594 mutex_lock(&clockevents_mutex);
595 raw_spin_lock_irq(&clockevents_lock);
596 list_for_each_entry(ce, &clockevent_devices, list) {
597 if (!strcmp(ce->name, name)) {
598 ret = __clockevents_try_unbind(ce, dev->id);
599 break;
600 }
601 }
602 raw_spin_unlock_irq(&clockevents_lock);
603 /*
604 * We hold clockevents_mutex, so ce can't go away
605 */
606 if (ret == -EAGAIN)
607 ret = clockevents_unbind(ce, dev->id);
608 mutex_unlock(&clockevents_mutex);
609 return ret ? ret : count;
610}
611static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev);
612
613#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
614static struct device tick_bc_dev = {
615 .init_name = "broadcast",
616 .id = 0,
617 .bus = &clockevents_subsys,
618};
619
620static struct tick_device *tick_get_tick_dev(struct device *dev)
621{
622 return dev == &tick_bc_dev ? tick_get_broadcast_device() :
623 &per_cpu(tick_cpu_device, dev->id);
624}
625
626static __init int tick_broadcast_init_sysfs(void)
627{
628 int err = device_register(&tick_bc_dev);
629
630 if (!err)
631 err = device_create_file(&tick_bc_dev, &dev_attr_current_device);
632 return err;
633}
634#else
635static struct tick_device *tick_get_tick_dev(struct device *dev)
636{
637 return &per_cpu(tick_cpu_device, dev->id);
638}
639static inline int tick_broadcast_init_sysfs(void) { return 0; }
465#endif 640#endif
641
642static int __init tick_init_sysfs(void)
643{
644 int cpu;
645
646 for_each_possible_cpu(cpu) {
647 struct device *dev = &per_cpu(tick_percpu_dev, cpu);
648 int err;
649
650 dev->id = cpu;
651 dev->bus = &clockevents_subsys;
652 err = device_register(dev);
653 if (!err)
654 err = device_create_file(dev, &dev_attr_current_device);
655 if (!err)
656 err = device_create_file(dev, &dev_attr_unbind_device);
657 if (err)
658 return err;
659 }
660 return tick_broadcast_init_sysfs();
661}
662
663static int __init clockevents_init_sysfs(void)
664{
665 int err = subsys_system_register(&clockevents_subsys, NULL);
666
667 if (!err)
668 err = tick_init_sysfs();
669 return err;
670}
671device_initcall(clockevents_init_sysfs);
672#endif /* SYSFS */
673
674#endif /* GENERIC_CLOCK_EVENTS */
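
clockevents_subsys is registered through subsys_system_register(), so the per-CPU tick devices appear under /sys/devices/system/clockevents/clockeventN with the current_device and unbind_device attributes defined above. A small sketch of requesting an unbind from userspace; the device name is only an example, and the write fails (for instance with EBUSY) when no replacement device can be installed.

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/devices/system/clockevents/clockevent0/unbind_device";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("lapic\n", f);	/* name must match a registered clock_event_device */
	return fclose(f) ? 1 : 0;
}
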
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c9583382141a..50a8736757f3 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -31,6 +31,8 @@
31#include <linux/tick.h> 31#include <linux/tick.h>
32#include <linux/kthread.h> 32#include <linux/kthread.h>
33 33
34#include "tick-internal.h"
35
34void timecounter_init(struct timecounter *tc, 36void timecounter_init(struct timecounter *tc,
35 const struct cyclecounter *cc, 37 const struct cyclecounter *cc,
36 u64 start_tstamp) 38 u64 start_tstamp)
@@ -174,11 +176,12 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
174static struct clocksource *curr_clocksource; 176static struct clocksource *curr_clocksource;
175static LIST_HEAD(clocksource_list); 177static LIST_HEAD(clocksource_list);
176static DEFINE_MUTEX(clocksource_mutex); 178static DEFINE_MUTEX(clocksource_mutex);
177static char override_name[32]; 179static char override_name[CS_NAME_LEN];
178static int finished_booting; 180static int finished_booting;
179 181
180#ifdef CONFIG_CLOCKSOURCE_WATCHDOG 182#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
181static void clocksource_watchdog_work(struct work_struct *work); 183static void clocksource_watchdog_work(struct work_struct *work);
184static void clocksource_select(void);
182 185
183static LIST_HEAD(watchdog_list); 186static LIST_HEAD(watchdog_list);
184static struct clocksource *watchdog; 187static struct clocksource *watchdog;
@@ -299,13 +302,30 @@ static void clocksource_watchdog(unsigned long data)
299 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && 302 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
300 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && 303 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
301 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { 304 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
305 /* Mark it valid for high-res. */
302 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 306 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
307
308 /*
309 * clocksource_done_booting() will sort it if
310 * finished_booting is not set yet.
311 */
312 if (!finished_booting)
313 continue;
314
303 /* 315 /*
304 * We just marked the clocksource as highres-capable, 316 * If this is not the current clocksource let
305 * notify the rest of the system as well so that we 317 * the watchdog thread reselect it. Due to the
306 * transition into high-res mode: 318 * change to high res this clocksource might
319 * be preferred now. If it is the current
320 * clocksource let the tick code know about
321 * that change.
307 */ 322 */
308 tick_clock_notify(); 323 if (cs != curr_clocksource) {
324 cs->flags |= CLOCK_SOURCE_RESELECT;
325 schedule_work(&watchdog_work);
326 } else {
327 tick_clock_notify();
328 }
309 } 329 }
310 } 330 }
311 331
@@ -388,44 +408,39 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
388 408
389static void clocksource_dequeue_watchdog(struct clocksource *cs) 409static void clocksource_dequeue_watchdog(struct clocksource *cs)
390{ 410{
391 struct clocksource *tmp;
392 unsigned long flags; 411 unsigned long flags;
393 412
394 spin_lock_irqsave(&watchdog_lock, flags); 413 spin_lock_irqsave(&watchdog_lock, flags);
395 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { 414 if (cs != watchdog) {
396 /* cs is a watched clocksource. */ 415 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
397 list_del_init(&cs->wd_list); 416 /* cs is a watched clocksource. */
398 } else if (cs == watchdog) { 417 list_del_init(&cs->wd_list);
399 /* Reset watchdog cycles */ 418 /* Check if the watchdog timer needs to be stopped. */
400 clocksource_reset_watchdog(); 419 clocksource_stop_watchdog();
401 /* Current watchdog is removed. Find an alternative. */
402 watchdog = NULL;
403 list_for_each_entry(tmp, &clocksource_list, list) {
404 if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
405 continue;
406 if (!watchdog || tmp->rating > watchdog->rating)
407 watchdog = tmp;
408 } 420 }
409 } 421 }
410 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
411 /* Check if the watchdog timer needs to be stopped. */
412 clocksource_stop_watchdog();
413 spin_unlock_irqrestore(&watchdog_lock, flags); 422 spin_unlock_irqrestore(&watchdog_lock, flags);
414} 423}
415 424
416static int clocksource_watchdog_kthread(void *data) 425static int __clocksource_watchdog_kthread(void)
417{ 426{
418 struct clocksource *cs, *tmp; 427 struct clocksource *cs, *tmp;
419 unsigned long flags; 428 unsigned long flags;
420 LIST_HEAD(unstable); 429 LIST_HEAD(unstable);
430 int select = 0;
421 431
422 mutex_lock(&clocksource_mutex);
423 spin_lock_irqsave(&watchdog_lock, flags); 432 spin_lock_irqsave(&watchdog_lock, flags);
424 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) 433 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
425 if (cs->flags & CLOCK_SOURCE_UNSTABLE) { 434 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
426 list_del_init(&cs->wd_list); 435 list_del_init(&cs->wd_list);
427 list_add(&cs->wd_list, &unstable); 436 list_add(&cs->wd_list, &unstable);
437 select = 1;
428 } 438 }
439 if (cs->flags & CLOCK_SOURCE_RESELECT) {
440 cs->flags &= ~CLOCK_SOURCE_RESELECT;
441 select = 1;
442 }
443 }
429 /* Check if the watchdog timer needs to be stopped. */ 444 /* Check if the watchdog timer needs to be stopped. */
430 clocksource_stop_watchdog(); 445 clocksource_stop_watchdog();
431 spin_unlock_irqrestore(&watchdog_lock, flags); 446 spin_unlock_irqrestore(&watchdog_lock, flags);
@@ -435,10 +450,23 @@ static int clocksource_watchdog_kthread(void *data)
435 list_del_init(&cs->wd_list); 450 list_del_init(&cs->wd_list);
436 __clocksource_change_rating(cs, 0); 451 __clocksource_change_rating(cs, 0);
437 } 452 }
453 return select;
454}
455
456static int clocksource_watchdog_kthread(void *data)
457{
458 mutex_lock(&clocksource_mutex);
459 if (__clocksource_watchdog_kthread())
460 clocksource_select();
438 mutex_unlock(&clocksource_mutex); 461 mutex_unlock(&clocksource_mutex);
439 return 0; 462 return 0;
440} 463}
441 464
465static bool clocksource_is_watchdog(struct clocksource *cs)
466{
467 return cs == watchdog;
468}
469
442#else /* CONFIG_CLOCKSOURCE_WATCHDOG */ 470#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
443 471
444static void clocksource_enqueue_watchdog(struct clocksource *cs) 472static void clocksource_enqueue_watchdog(struct clocksource *cs)
@@ -449,7 +477,8 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
449 477
450static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } 478static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
451static inline void clocksource_resume_watchdog(void) { } 479static inline void clocksource_resume_watchdog(void) { }
452static inline int clocksource_watchdog_kthread(void *data) { return 0; } 480static inline int __clocksource_watchdog_kthread(void) { return 0; }
481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
453 482
454#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 483#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
455 484
@@ -553,24 +582,42 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
553 582
554#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET 583#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
555 584
556/** 585static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
557 * clocksource_select - Select the best clocksource available
558 *
559 * Private function. Must hold clocksource_mutex when called.
560 *
561 * Select the clocksource with the best rating, or the clocksource,
562 * which is selected by userspace override.
563 */
564static void clocksource_select(void)
565{ 586{
566 struct clocksource *best, *cs; 587 struct clocksource *cs;
567 588
568 if (!finished_booting || list_empty(&clocksource_list)) 589 if (!finished_booting || list_empty(&clocksource_list))
590 return NULL;
591
592 /*
593 * We pick the clocksource with the highest rating. If oneshot
594 * mode is active, we pick the highres valid clocksource with
595 * the best rating.
596 */
597 list_for_each_entry(cs, &clocksource_list, list) {
598 if (skipcur && cs == curr_clocksource)
599 continue;
600 if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
601 continue;
602 return cs;
603 }
604 return NULL;
605}
606
607static void __clocksource_select(bool skipcur)
608{
609 bool oneshot = tick_oneshot_mode_active();
610 struct clocksource *best, *cs;
611
612 /* Find the best suitable clocksource */
613 best = clocksource_find_best(oneshot, skipcur);
614 if (!best)
569 return; 615 return;
570 /* First clocksource on the list has the best rating. */ 616
571 best = list_first_entry(&clocksource_list, struct clocksource, list);
572 /* Check for the override clocksource. */ 617 /* Check for the override clocksource. */
573 list_for_each_entry(cs, &clocksource_list, list) { 618 list_for_each_entry(cs, &clocksource_list, list) {
619 if (skipcur && cs == curr_clocksource)
620 continue;
574 if (strcmp(cs->name, override_name) != 0) 621 if (strcmp(cs->name, override_name) != 0)
575 continue; 622 continue;
576 /* 623 /*
@@ -578,8 +625,7 @@ static void clocksource_select(void)
578 * capable clocksource if the tick code is in oneshot 625 * capable clocksource if the tick code is in oneshot
579 * mode (highres or nohz) 626 * mode (highres or nohz)
580 */ 627 */
581 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && 628 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
582 tick_oneshot_mode_active()) {
583 /* Override clocksource cannot be used. */ 629 /* Override clocksource cannot be used. */
584 printk(KERN_WARNING "Override clocksource %s is not " 630 printk(KERN_WARNING "Override clocksource %s is not "
585 "HRT compatible. Cannot switch while in " 631 "HRT compatible. Cannot switch while in "
@@ -590,16 +636,35 @@ static void clocksource_select(void)
590 best = cs; 636 best = cs;
591 break; 637 break;
592 } 638 }
593 if (curr_clocksource != best) { 639
594 printk(KERN_INFO "Switching to clocksource %s\n", best->name); 640 if (curr_clocksource != best && !timekeeping_notify(best)) {
641 pr_info("Switched to clocksource %s\n", best->name);
595 curr_clocksource = best; 642 curr_clocksource = best;
596 timekeeping_notify(curr_clocksource);
597 } 643 }
598} 644}
599 645
646/**
647 * clocksource_select - Select the best clocksource available
648 *
649 * Private function. Must hold clocksource_mutex when called.
650 *
651 * Select the clocksource with the best rating, or the clocksource,
652 * which is selected by userspace override.
653 */
654static void clocksource_select(void)
655{
656 return __clocksource_select(false);
657}
658
659static void clocksource_select_fallback(void)
660{
661 return __clocksource_select(true);
662}
663
600#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ 664#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
601 665
602static inline void clocksource_select(void) { } 666static inline void clocksource_select(void) { }
667static inline void clocksource_select_fallback(void) { }
603 668
604#endif 669#endif
605 670
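Concretely, clocksource_select_fallback() re-runs the selection with the current clocksource excluded: if, say, tsc (rating 300) is the current clocksource and hpet (rating 250) is also registered, unbinding tsc first switches the timekeeper over to hpet; only if no usable replacement can be installed does the unbind fail with -EBUSY in clocksource_unbind() further below. (The ratings are the usual upstream values, quoted purely for illustration.)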
@@ -614,16 +679,11 @@ static int __init clocksource_done_booting(void)
614{ 679{
615 mutex_lock(&clocksource_mutex); 680 mutex_lock(&clocksource_mutex);
616 curr_clocksource = clocksource_default_clock(); 681 curr_clocksource = clocksource_default_clock();
617 mutex_unlock(&clocksource_mutex);
618
619 finished_booting = 1; 682 finished_booting = 1;
620
621 /* 683 /*
622 * Run the watchdog first to eliminate unstable clock sources 684 * Run the watchdog first to eliminate unstable clock sources
623 */ 685 */
624 clocksource_watchdog_kthread(NULL); 686 __clocksource_watchdog_kthread();
625
626 mutex_lock(&clocksource_mutex);
627 clocksource_select(); 687 clocksource_select();
628 mutex_unlock(&clocksource_mutex); 688 mutex_unlock(&clocksource_mutex);
629 return 0; 689 return 0;
@@ -756,7 +816,6 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
756 list_del(&cs->list); 816 list_del(&cs->list);
757 cs->rating = rating; 817 cs->rating = rating;
758 clocksource_enqueue(cs); 818 clocksource_enqueue(cs);
759 clocksource_select();
760} 819}
761 820
762/** 821/**
@@ -768,21 +827,47 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
768{ 827{
769 mutex_lock(&clocksource_mutex); 828 mutex_lock(&clocksource_mutex);
770 __clocksource_change_rating(cs, rating); 829 __clocksource_change_rating(cs, rating);
830 clocksource_select();
771 mutex_unlock(&clocksource_mutex); 831 mutex_unlock(&clocksource_mutex);
772} 832}
773EXPORT_SYMBOL(clocksource_change_rating); 833EXPORT_SYMBOL(clocksource_change_rating);
774 834
835/*
836 * Unbind clocksource @cs. Called with clocksource_mutex held
837 */
838static int clocksource_unbind(struct clocksource *cs)
839{
840 /*
841 * I really can't convince myself to support this on hardware
842 * designed by lobotomized monkeys.
843 */
844 if (clocksource_is_watchdog(cs))
845 return -EBUSY;
846
847 if (cs == curr_clocksource) {
848 /* Select and try to install a replacement clock source */
849 clocksource_select_fallback();
850 if (curr_clocksource == cs)
851 return -EBUSY;
852 }
853 clocksource_dequeue_watchdog(cs);
854 list_del_init(&cs->list);
855 return 0;
856}
857
775/** 858/**
776 * clocksource_unregister - remove a registered clocksource 859 * clocksource_unregister - remove a registered clocksource
777 * @cs: clocksource to be unregistered 860 * @cs: clocksource to be unregistered
778 */ 861 */
779void clocksource_unregister(struct clocksource *cs) 862int clocksource_unregister(struct clocksource *cs)
780{ 863{
864 int ret = 0;
865
781 mutex_lock(&clocksource_mutex); 866 mutex_lock(&clocksource_mutex);
782 clocksource_dequeue_watchdog(cs); 867 if (!list_empty(&cs->list))
783 list_del(&cs->list); 868 ret = clocksource_unbind(cs);
784 clocksource_select();
785 mutex_unlock(&clocksource_mutex); 869 mutex_unlock(&clocksource_mutex);
870 return ret;
786} 871}
787EXPORT_SYMBOL(clocksource_unregister); 872EXPORT_SYMBOL(clocksource_unregister);
788 873
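Since clocksource_unregister() can now fail, teardown paths have to check its result. A minimal, hypothetical caller sketch (my_cs is an illustrative clocksource, not part of this patch):

	static void __exit my_driver_exit(void)
	{
		/* Fails with -EBUSY if my_cs is the watchdog or cannot be replaced. */
		if (clocksource_unregister(&my_cs))
			pr_warn("my_cs: clocksource still in use, not unregistered\n");
	}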
@@ -808,6 +893,23 @@ sysfs_show_current_clocksources(struct device *dev,
808 return count; 893 return count;
809} 894}
810 895
896size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
897{
898 size_t ret = cnt;
899
900 /* strings from sysfs write are not 0 terminated! */
901 if (!cnt || cnt >= CS_NAME_LEN)
902 return -EINVAL;
903
 904 /* strip off \n: */
905 if (buf[cnt-1] == '\n')
906 cnt--;
907 if (cnt > 0)
908 memcpy(dst, buf, cnt);
909 dst[cnt] = 0;
910 return ret;
911}
912
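sysfs_get_uname() copies at most CS_NAME_LEN-1 bytes, strips the trailing newline that echo appends, NUL-terminates the result, and returns the original count so the sysfs write is consumed in full. One detail worth a second look: the helper and its callers below keep the result in a size_t, so the "< 0" / ">= 0" error checks cannot trigger as written; a signed ssize_t would be the safer type. A standalone illustration of the intended behaviour (userspace sketch, not kernel code):

	#include <stdio.h>
	#include <string.h>
	#define CS_NAME_LEN 32

	static long get_uname(const char *buf, char *dst, size_t cnt)
	{
		long ret = cnt;

		if (!cnt || cnt >= CS_NAME_LEN)
			return -1;		/* -EINVAL in the kernel version */
		if (buf[cnt - 1] == '\n')	/* strip the newline echo appends */
			cnt--;
		memcpy(dst, buf, cnt);
		dst[cnt] = 0;
		return ret;			/* original count: write fully consumed */
	}

	int main(void)
	{
		char name[CS_NAME_LEN];

		get_uname("hpet\n", name, 5);
		printf("'%s'\n", name);		/* prints 'hpet' */
		return 0;
	}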
811/** 913/**
812 * sysfs_override_clocksource - interface for manually overriding clocksource 914 * sysfs_override_clocksource - interface for manually overriding clocksource
813 * @dev: unused 915 * @dev: unused
@@ -822,22 +924,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
822 struct device_attribute *attr, 924 struct device_attribute *attr,
823 const char *buf, size_t count) 925 const char *buf, size_t count)
824{ 926{
825 size_t ret = count; 927 size_t ret;
826
827 /* strings from sysfs write are not 0 terminated! */
828 if (count >= sizeof(override_name))
829 return -EINVAL;
830
831 /* strip of \n: */
832 if (buf[count-1] == '\n')
833 count--;
834 928
835 mutex_lock(&clocksource_mutex); 929 mutex_lock(&clocksource_mutex);
836 930
837 if (count > 0) 931 ret = sysfs_get_uname(buf, override_name, count);
838 memcpy(override_name, buf, count); 932 if (ret >= 0)
839 override_name[count] = 0; 933 clocksource_select();
840 clocksource_select();
841 934
842 mutex_unlock(&clocksource_mutex); 935 mutex_unlock(&clocksource_mutex);
843 936
@@ -845,6 +938,40 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
845} 938}
846 939
847/** 940/**
 941 * sysfs_unbind_clocksource - interface for manually unbinding a clocksource
942 * @dev: unused
943 * @attr: unused
 944 * @buf: name of the clocksource to unbind
945 * @count: length of buffer
946 *
947 * Takes input from sysfs interface for manually unbinding a clocksource.
948 */
949static ssize_t sysfs_unbind_clocksource(struct device *dev,
950 struct device_attribute *attr,
951 const char *buf, size_t count)
952{
953 struct clocksource *cs;
954 char name[CS_NAME_LEN];
955 size_t ret;
956
957 ret = sysfs_get_uname(buf, name, count);
958 if (ret < 0)
959 return ret;
960
961 ret = -ENODEV;
962 mutex_lock(&clocksource_mutex);
963 list_for_each_entry(cs, &clocksource_list, list) {
964 if (strcmp(cs->name, name))
965 continue;
966 ret = clocksource_unbind(cs);
967 break;
968 }
969 mutex_unlock(&clocksource_mutex);
970
971 return ret ? ret : count;
972}
973
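With the new attribute registered below next to current_clocksource, the interface should be usable along the lines of:

	echo acpi_pm > /sys/devices/system/clocksource/clocksource0/unbind_clocksource

A successful unbind returns the write count, an unknown name yields -ENODEV, and a clocksource that is the watchdog or cannot be replaced yields -EBUSY.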
974/**
848 * sysfs_show_available_clocksources - sysfs interface for listing clocksource 975 * sysfs_show_available_clocksources - sysfs interface for listing clocksource
849 * @dev: unused 976 * @dev: unused
850 * @attr: unused 977 * @attr: unused
@@ -886,6 +1013,8 @@ sysfs_show_available_clocksources(struct device *dev,
886static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, 1013static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
887 sysfs_override_clocksource); 1014 sysfs_override_clocksource);
888 1015
1016static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource);
1017
889static DEVICE_ATTR(available_clocksource, 0444, 1018static DEVICE_ATTR(available_clocksource, 0444,
890 sysfs_show_available_clocksources, NULL); 1019 sysfs_show_available_clocksources, NULL);
891 1020
@@ -910,6 +1039,9 @@ static int __init init_clocksource_sysfs(void)
910 &device_clocksource, 1039 &device_clocksource,
911 &dev_attr_current_clocksource); 1040 &dev_attr_current_clocksource);
912 if (!error) 1041 if (!error)
1042 error = device_create_file(&device_clocksource,
1043 &dev_attr_unbind_clocksource);
1044 if (!error)
913 error = device_create_file( 1045 error = device_create_file(
914 &device_clocksource, 1046 &device_clocksource,
915 &dev_attr_available_clocksource); 1047 &dev_attr_available_clocksource);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
new file mode 100644
index 000000000000..a326f27d7f09
--- /dev/null
+++ b/kernel/time/sched_clock.c
@@ -0,0 +1,212 @@
1/*
2 * sched_clock.c: support for extending counters to full 64-bit ns counter
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8#include <linux/clocksource.h>
9#include <linux/init.h>
10#include <linux/jiffies.h>
11#include <linux/kernel.h>
12#include <linux/moduleparam.h>
13#include <linux/sched.h>
14#include <linux/syscore_ops.h>
15#include <linux/timer.h>
16#include <linux/sched_clock.h>
17
18struct clock_data {
19 u64 epoch_ns;
20 u32 epoch_cyc;
21 u32 epoch_cyc_copy;
22 unsigned long rate;
23 u32 mult;
24 u32 shift;
25 bool suspended;
26};
27
28static void sched_clock_poll(unsigned long wrap_ticks);
29static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0);
30static int irqtime = -1;
31
32core_param(irqtime, irqtime, int, 0400);
33
34static struct clock_data cd = {
35 .mult = NSEC_PER_SEC / HZ,
36};
37
38static u32 __read_mostly sched_clock_mask = 0xffffffff;
39
40static u32 notrace jiffy_sched_clock_read(void)
41{
42 return (u32)(jiffies - INITIAL_JIFFIES);
43}
44
45static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
46
47static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
48{
49 return (cyc * mult) >> shift;
50}
51
52static unsigned long long notrace sched_clock_32(void)
53{
54 u64 epoch_ns;
55 u32 epoch_cyc;
56 u32 cyc;
57
58 if (cd.suspended)
59 return cd.epoch_ns;
60
61 /*
62 * Load the epoch_cyc and epoch_ns atomically. We do this by
63 * ensuring that we always write epoch_cyc, epoch_ns and
64 * epoch_cyc_copy in strict order, and read them in strict order.
65 * If epoch_cyc and epoch_cyc_copy are not equal, then we're in
66 * the middle of an update, and we should repeat the load.
67 */
68 do {
69 epoch_cyc = cd.epoch_cyc;
70 smp_rmb();
71 epoch_ns = cd.epoch_ns;
72 smp_rmb();
73 } while (epoch_cyc != cd.epoch_cyc_copy);
74
75 cyc = read_sched_clock();
76 cyc = (cyc - epoch_cyc) & sched_clock_mask;
77 return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
78}
79
80/*
81 * Atomically update the sched_clock epoch.
82 */
83static void notrace update_sched_clock(void)
84{
85 unsigned long flags;
86 u32 cyc;
87 u64 ns;
88
89 cyc = read_sched_clock();
90 ns = cd.epoch_ns +
91 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
92 cd.mult, cd.shift);
93 /*
94 * Write epoch_cyc and epoch_ns in a way that the update is
 95 * detectable in sched_clock_32().
96 */
97 raw_local_irq_save(flags);
98 cd.epoch_cyc_copy = cyc;
99 smp_wmb();
100 cd.epoch_ns = ns;
101 smp_wmb();
102 cd.epoch_cyc = cyc;
103 raw_local_irq_restore(flags);
104}
105
106static void sched_clock_poll(unsigned long wrap_ticks)
107{
108 mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));
109 update_sched_clock();
110}
111
112void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
113{
114 unsigned long r, w;
115 u64 res, wrap;
116 char r_unit;
117
118 if (cd.rate > rate)
119 return;
120
121 BUG_ON(bits > 32);
122 WARN_ON(!irqs_disabled());
123 read_sched_clock = read;
124 sched_clock_mask = (1 << bits) - 1;
125 cd.rate = rate;
126
127 /* calculate the mult/shift to convert counter ticks to ns. */
128 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0);
129
130 r = rate;
131 if (r >= 4000000) {
132 r /= 1000000;
133 r_unit = 'M';
134 } else if (r >= 1000) {
135 r /= 1000;
136 r_unit = 'k';
137 } else
138 r_unit = ' ';
139
140 /* calculate how many ns until we wrap */
141 wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift);
142 do_div(wrap, NSEC_PER_MSEC);
143 w = wrap;
144
145 /* calculate the ns resolution of this counter */
146 res = cyc_to_ns(1ULL, cd.mult, cd.shift);
147 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n",
148 bits, r, r_unit, res, w);
149
150 /*
151 * Start the timer to keep sched_clock() properly updated and
 152 * set the initial epoch.
153 */
154 sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));
155 update_sched_clock();
156
157 /*
158 * Ensure that sched_clock() starts off at 0ns
159 */
160 cd.epoch_ns = 0;
161
162 /* Enable IRQ time accounting if we have a fast enough sched_clock */
163 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
164 enable_sched_clock_irqtime();
165
166 pr_debug("Registered %pF as sched_clock source\n", read);
167}
168
169unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32;
170
171unsigned long long notrace sched_clock(void)
172{
173 return sched_clock_func();
174}
175
176void __init sched_clock_postinit(void)
177{
178 /*
179 * If no sched_clock function has been provided at that point,
 180 * make it the final one.
181 */
182 if (read_sched_clock == jiffy_sched_clock_read)
183 setup_sched_clock(jiffy_sched_clock_read, 32, HZ);
184
185 sched_clock_poll(sched_clock_timer.data);
186}
187
188static int sched_clock_suspend(void)
189{
190 sched_clock_poll(sched_clock_timer.data);
191 cd.suspended = true;
192 return 0;
193}
194
195static void sched_clock_resume(void)
196{
197 cd.epoch_cyc = read_sched_clock();
198 cd.epoch_cyc_copy = cd.epoch_cyc;
199 cd.suspended = false;
200}
201
202static struct syscore_ops sched_clock_ops = {
203 .suspend = sched_clock_suspend,
204 .resume = sched_clock_resume,
205};
206
207static int __init sched_clock_syscore_init(void)
208{
209 register_syscore_ops(&sched_clock_ops);
210 return 0;
211}
212device_initcall(sched_clock_syscore_init);
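To put numbers on the pr_info() above: a hypothetical 32-bit counter running at 24 MHz has a resolution of about 10^9 / (24 * 10^6) ≈ 41 ns and wraps after 2^32 / (24 * 10^6) ≈ 179 s, so the poll timer is armed for roughly w - w/10 ≈ 161 s and refreshes the epoch comfortably before the counter wraps.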
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 20d6fba70652..6d3f91631de6 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -19,6 +19,7 @@
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/module.h>
22 23
23#include "tick-internal.h" 24#include "tick-internal.h"
24 25
@@ -29,6 +30,7 @@
29 30
30static struct tick_device tick_broadcast_device; 31static struct tick_device tick_broadcast_device;
31static cpumask_var_t tick_broadcast_mask; 32static cpumask_var_t tick_broadcast_mask;
33static cpumask_var_t tick_broadcast_on;
32static cpumask_var_t tmpmask; 34static cpumask_var_t tmpmask;
33static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); 35static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
34static int tick_broadcast_force; 36static int tick_broadcast_force;
@@ -64,17 +66,34 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
64/* 66/*
65 * Check, if the device can be utilized as broadcast device: 67 * Check, if the device can be utilized as broadcast device:
66 */ 68 */
67int tick_check_broadcast_device(struct clock_event_device *dev) 69static bool tick_check_broadcast_device(struct clock_event_device *curdev,
70 struct clock_event_device *newdev)
71{
72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
73 (newdev->features & CLOCK_EVT_FEAT_C3STOP))
74 return false;
75
76 if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT &&
77 !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
78 return false;
79
80 return !curdev || newdev->rating > curdev->rating;
81}
82
83/*
84 * Conditionally install/replace broadcast device
85 */
86void tick_install_broadcast_device(struct clock_event_device *dev)
68{ 87{
69 struct clock_event_device *cur = tick_broadcast_device.evtdev; 88 struct clock_event_device *cur = tick_broadcast_device.evtdev;
70 89
71 if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || 90 if (!tick_check_broadcast_device(cur, dev))
72 (tick_broadcast_device.evtdev && 91 return;
73 tick_broadcast_device.evtdev->rating >= dev->rating) ||
74 (dev->features & CLOCK_EVT_FEAT_C3STOP))
75 return 0;
76 92
77 clockevents_exchange_device(tick_broadcast_device.evtdev, dev); 93 if (!try_module_get(dev->owner))
94 return;
95
96 clockevents_exchange_device(cur, dev);
78 if (cur) 97 if (cur)
79 cur->event_handler = clockevents_handle_noop; 98 cur->event_handler = clockevents_handle_noop;
80 tick_broadcast_device.evtdev = dev; 99 tick_broadcast_device.evtdev = dev;
@@ -90,7 +109,6 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
90 */ 109 */
91 if (dev->features & CLOCK_EVT_FEAT_ONESHOT) 110 if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
92 tick_clock_notify(); 111 tick_clock_notify();
93 return 1;
94} 112}
95 113
96/* 114/*
@@ -123,8 +141,9 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
123 */ 141 */
124int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) 142int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
125{ 143{
144 struct clock_event_device *bc = tick_broadcast_device.evtdev;
126 unsigned long flags; 145 unsigned long flags;
127 int ret = 0; 146 int ret;
128 147
129 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 148 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
130 149
@@ -138,20 +157,59 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
138 dev->event_handler = tick_handle_periodic; 157 dev->event_handler = tick_handle_periodic;
139 tick_device_setup_broadcast_func(dev); 158 tick_device_setup_broadcast_func(dev);
140 cpumask_set_cpu(cpu, tick_broadcast_mask); 159 cpumask_set_cpu(cpu, tick_broadcast_mask);
141 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 160 tick_broadcast_start_periodic(bc);
142 ret = 1; 161 ret = 1;
143 } else { 162 } else {
144 /* 163 /*
145 * When the new device is not affected by the stop 164 * Clear the broadcast bit for this cpu if the
146 * feature and the cpu is marked in the broadcast mask 165 * device is not power state affected.
147 * then clear the broadcast bit.
148 */ 166 */
149 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 167 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
150 int cpu = smp_processor_id();
151 cpumask_clear_cpu(cpu, tick_broadcast_mask); 168 cpumask_clear_cpu(cpu, tick_broadcast_mask);
152 tick_broadcast_clear_oneshot(cpu); 169 else
153 } else {
154 tick_device_setup_broadcast_func(dev); 170 tick_device_setup_broadcast_func(dev);
171
172 /*
173 * Clear the broadcast bit if the CPU is not in
174 * periodic broadcast on state.
175 */
176 if (!cpumask_test_cpu(cpu, tick_broadcast_on))
177 cpumask_clear_cpu(cpu, tick_broadcast_mask);
178
179 switch (tick_broadcast_device.mode) {
180 case TICKDEV_MODE_ONESHOT:
181 /*
182 * If the system is in oneshot mode we can
183 * unconditionally clear the oneshot mask bit,
184 * because the CPU is running and therefore
185 * not in an idle state which causes the power
186 * state affected device to stop. Let the
187 * caller initialize the device.
188 */
189 tick_broadcast_clear_oneshot(cpu);
190 ret = 0;
191 break;
192
193 case TICKDEV_MODE_PERIODIC:
194 /*
195 * If the system is in periodic mode, check
196 * whether the broadcast device can be
197 * switched off now.
198 */
199 if (cpumask_empty(tick_broadcast_mask) && bc)
200 clockevents_shutdown(bc);
201 /*
202 * If we kept the cpu in the broadcast mask,
203 * tell the caller to leave the per cpu device
204 * in shutdown state. The periodic interrupt
205 * is delivered by the broadcast device.
206 */
207 ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
208 break;
209 default:
210 /* Nothing to do */
211 ret = 0;
212 break;
155 } 213 }
156 } 214 }
157 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 215 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -281,6 +339,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
281 switch (*reason) { 339 switch (*reason) {
282 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 340 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
283 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 341 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
342 cpumask_set_cpu(cpu, tick_broadcast_on);
284 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { 343 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
285 if (tick_broadcast_device.mode == 344 if (tick_broadcast_device.mode ==
286 TICKDEV_MODE_PERIODIC) 345 TICKDEV_MODE_PERIODIC)
@@ -290,8 +349,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
290 tick_broadcast_force = 1; 349 tick_broadcast_force = 1;
291 break; 350 break;
292 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 351 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
293 if (!tick_broadcast_force && 352 if (tick_broadcast_force)
294 cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { 353 break;
354 cpumask_clear_cpu(cpu, tick_broadcast_on);
355 if (!tick_device_is_functional(dev))
356 break;
357 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
295 if (tick_broadcast_device.mode == 358 if (tick_broadcast_device.mode ==
296 TICKDEV_MODE_PERIODIC) 359 TICKDEV_MODE_PERIODIC)
297 tick_setup_periodic(dev, 0); 360 tick_setup_periodic(dev, 0);
@@ -349,6 +412,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
349 412
350 bc = tick_broadcast_device.evtdev; 413 bc = tick_broadcast_device.evtdev;
351 cpumask_clear_cpu(cpu, tick_broadcast_mask); 414 cpumask_clear_cpu(cpu, tick_broadcast_mask);
415 cpumask_clear_cpu(cpu, tick_broadcast_on);
352 416
353 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 417 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
354 if (bc && cpumask_empty(tick_broadcast_mask)) 418 if (bc && cpumask_empty(tick_broadcast_mask))
@@ -475,7 +539,15 @@ void tick_check_oneshot_broadcast(int cpu)
475 if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { 539 if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {
476 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 540 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
477 541
478 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); 542 /*
543 * We might be in the middle of switching over from
544 * periodic to oneshot. If the CPU has not yet
545 * switched over, leave the device alone.
546 */
547 if (td->mode == TICKDEV_MODE_ONESHOT) {
548 clockevents_set_mode(td->evtdev,
549 CLOCK_EVT_MODE_ONESHOT);
550 }
479 } 551 }
480} 552}
481 553
@@ -522,6 +594,13 @@ again:
522 cpumask_clear(tick_broadcast_force_mask); 594 cpumask_clear(tick_broadcast_force_mask);
523 595
524 /* 596 /*
597 * Sanity check. Catch the case where we try to broadcast to
598 * offline cpus.
599 */
600 if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask)))
601 cpumask_and(tmpmask, tmpmask, cpu_online_mask);
602
603 /*
525 * Wakeup the cpus which have an expired event. 604 * Wakeup the cpus which have an expired event.
526 */ 605 */
527 tick_do_broadcast(tmpmask); 606 tick_do_broadcast(tmpmask);
@@ -761,10 +840,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
761 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 840 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
762 841
763 /* 842 /*
764 * Clear the broadcast mask flag for the dead cpu, but do not 843 * Clear the broadcast masks for the dead cpu, but do not stop
765 * stop the broadcast device! 844 * the broadcast device!
766 */ 845 */
767 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); 846 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
847 cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
848 cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
768 849
769 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 850 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
770} 851}
@@ -792,6 +873,7 @@ bool tick_broadcast_oneshot_available(void)
792void __init tick_broadcast_init(void) 873void __init tick_broadcast_init(void)
793{ 874{
794 zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); 875 zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
876 zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT);
795 zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); 877 zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);
796#ifdef CONFIG_TICK_ONESHOT 878#ifdef CONFIG_TICK_ONESHOT
797 zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); 879 zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 5d3fb100bc06..64522ecdfe0e 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,6 +18,7 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/module.h>
21 22
22#include <asm/irq_regs.h> 23#include <asm/irq_regs.h>
23 24
@@ -33,7 +34,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
33ktime_t tick_next_period; 34ktime_t tick_next_period;
34ktime_t tick_period; 35ktime_t tick_period;
35int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
36static DEFINE_RAW_SPINLOCK(tick_device_lock);
37 37
38/* 38/*
39 * Debugging: see timer_list.c 39 * Debugging: see timer_list.c
@@ -194,7 +194,8 @@ static void tick_setup_device(struct tick_device *td,
194 * When global broadcasting is active, check if the current 194 * When global broadcasting is active, check if the current
195 * device is registered as a placeholder for broadcast mode. 195 * device is registered as a placeholder for broadcast mode.
196 * This allows us to handle this x86 misfeature in a generic 196 * This allows us to handle this x86 misfeature in a generic
197 * way. 197 * way. This function also returns !=0 when we keep the
198 * current active broadcast state for this CPU.
198 */ 199 */
199 if (tick_device_uses_broadcast(newdev, cpu)) 200 if (tick_device_uses_broadcast(newdev, cpu))
200 return; 201 return;
@@ -205,17 +206,75 @@ static void tick_setup_device(struct tick_device *td,
205 tick_setup_oneshot(newdev, handler, next_event); 206 tick_setup_oneshot(newdev, handler, next_event);
206} 207}
207 208
209void tick_install_replacement(struct clock_event_device *newdev)
210{
211 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
212 int cpu = smp_processor_id();
213
214 clockevents_exchange_device(td->evtdev, newdev);
215 tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
216 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
217 tick_oneshot_notify();
218}
219
220static bool tick_check_percpu(struct clock_event_device *curdev,
221 struct clock_event_device *newdev, int cpu)
222{
223 if (!cpumask_test_cpu(cpu, newdev->cpumask))
224 return false;
225 if (cpumask_equal(newdev->cpumask, cpumask_of(cpu)))
226 return true;
227 /* Check if irq affinity can be set */
228 if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq))
229 return false;
230 /* Prefer an existing cpu local device */
231 if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
232 return false;
233 return true;
234}
235
236static bool tick_check_preferred(struct clock_event_device *curdev,
237 struct clock_event_device *newdev)
238{
239 /* Prefer oneshot capable device */
240 if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) {
241 if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT))
242 return false;
243 if (tick_oneshot_mode_active())
244 return false;
245 }
246
247 /*
248 * Use the higher rated one, but prefer a CPU local device with a lower
249 * rating than a non-CPU local device
250 */
251 return !curdev ||
252 newdev->rating > curdev->rating ||
253 !cpumask_equal(curdev->cpumask, newdev->cpumask);
254}
255
256/*
257 * Check whether the new device is a better fit than curdev. curdev
258 * can be NULL !
259 */
260bool tick_check_replacement(struct clock_event_device *curdev,
261 struct clock_event_device *newdev)
262{
263 if (tick_check_percpu(curdev, newdev, smp_processor_id()))
264 return false;
265
266 return tick_check_preferred(curdev, newdev);
267}
268
208/* 269/*
209 * Check, if the new registered device should be used. 270 * Check, if the new registered device should be used. Called with
271 * clockevents_lock held and interrupts disabled.
210 */ 272 */
211static int tick_check_new_device(struct clock_event_device *newdev) 273void tick_check_new_device(struct clock_event_device *newdev)
212{ 274{
213 struct clock_event_device *curdev; 275 struct clock_event_device *curdev;
214 struct tick_device *td; 276 struct tick_device *td;
215 int cpu, ret = NOTIFY_OK; 277 int cpu;
216 unsigned long flags;
217
218 raw_spin_lock_irqsave(&tick_device_lock, flags);
219 278
220 cpu = smp_processor_id(); 279 cpu = smp_processor_id();
221 if (!cpumask_test_cpu(cpu, newdev->cpumask)) 280 if (!cpumask_test_cpu(cpu, newdev->cpumask))
@@ -225,40 +284,15 @@ static int tick_check_new_device(struct clock_event_device *newdev)
225 curdev = td->evtdev; 284 curdev = td->evtdev;
226 285
227 /* cpu local device ? */ 286 /* cpu local device ? */
228 if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { 287 if (!tick_check_percpu(curdev, newdev, cpu))
229 288 goto out_bc;
230 /*
231 * If the cpu affinity of the device interrupt can not
232 * be set, ignore it.
233 */
234 if (!irq_can_set_affinity(newdev->irq))
235 goto out_bc;
236 289
237 /* 290 /* Preference decision */
238 * If we have a cpu local device already, do not replace it 291 if (!tick_check_preferred(curdev, newdev))
239 * by a non cpu local device 292 goto out_bc;
240 */
241 if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
242 goto out_bc;
243 }
244 293
245 /* 294 if (!try_module_get(newdev->owner))
246 * If we have an active device, then check the rating and the oneshot 295 return;
247 * feature.
248 */
249 if (curdev) {
250 /*
251 * Prefer one shot capable devices !
252 */
253 if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
254 !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
255 goto out_bc;
256 /*
257 * Check the rating
258 */
259 if (curdev->rating >= newdev->rating)
260 goto out_bc;
261 }
262 296
263 /* 297 /*
264 * Replace the eventually existing device by the new 298 * Replace the eventually existing device by the new
@@ -273,20 +307,13 @@ static int tick_check_new_device(struct clock_event_device *newdev)
273 tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); 307 tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
274 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 308 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
275 tick_oneshot_notify(); 309 tick_oneshot_notify();
276 310 return;
277 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
278 return NOTIFY_STOP;
279 311
280out_bc: 312out_bc:
281 /* 313 /*
282 * Can the new device be used as a broadcast device ? 314 * Can the new device be used as a broadcast device ?
283 */ 315 */
284 if (tick_check_broadcast_device(newdev)) 316 tick_install_broadcast_device(newdev);
285 ret = NOTIFY_STOP;
286
287 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
288
289 return ret;
290} 317}
291 318
292/* 319/*
@@ -294,7 +321,7 @@ out_bc:
294 * 321 *
295 * Called with interrupts disabled. 322 * Called with interrupts disabled.
296 */ 323 */
297static void tick_handover_do_timer(int *cpup) 324void tick_handover_do_timer(int *cpup)
298{ 325{
299 if (*cpup == tick_do_timer_cpu) { 326 if (*cpup == tick_do_timer_cpu) {
300 int cpu = cpumask_first(cpu_online_mask); 327 int cpu = cpumask_first(cpu_online_mask);
@@ -311,13 +338,11 @@ static void tick_handover_do_timer(int *cpup)
311 * access the hardware device itself. 338 * access the hardware device itself.
312 * We just set the mode and remove it from the lists. 339 * We just set the mode and remove it from the lists.
313 */ 340 */
314static void tick_shutdown(unsigned int *cpup) 341void tick_shutdown(unsigned int *cpup)
315{ 342{
316 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); 343 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
317 struct clock_event_device *dev = td->evtdev; 344 struct clock_event_device *dev = td->evtdev;
318 unsigned long flags;
319 345
320 raw_spin_lock_irqsave(&tick_device_lock, flags);
321 td->mode = TICKDEV_MODE_PERIODIC; 346 td->mode = TICKDEV_MODE_PERIODIC;
322 if (dev) { 347 if (dev) {
323 /* 348 /*
@@ -329,26 +354,20 @@ static void tick_shutdown(unsigned int *cpup)
329 dev->event_handler = clockevents_handle_noop; 354 dev->event_handler = clockevents_handle_noop;
330 td->evtdev = NULL; 355 td->evtdev = NULL;
331 } 356 }
332 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
333} 357}
334 358
335static void tick_suspend(void) 359void tick_suspend(void)
336{ 360{
337 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 361 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
338 unsigned long flags;
339 362
340 raw_spin_lock_irqsave(&tick_device_lock, flags);
341 clockevents_shutdown(td->evtdev); 363 clockevents_shutdown(td->evtdev);
342 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
343} 364}
344 365
345static void tick_resume(void) 366void tick_resume(void)
346{ 367{
347 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 368 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
348 unsigned long flags;
349 int broadcast = tick_resume_broadcast(); 369 int broadcast = tick_resume_broadcast();
350 370
351 raw_spin_lock_irqsave(&tick_device_lock, flags);
352 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); 371 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
353 372
354 if (!broadcast) { 373 if (!broadcast) {
@@ -357,68 +376,12 @@ static void tick_resume(void)
357 else 376 else
358 tick_resume_oneshot(); 377 tick_resume_oneshot();
359 } 378 }
360 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
361} 379}
362 380
363/*
364 * Notification about clock event devices
365 */
366static int tick_notify(struct notifier_block *nb, unsigned long reason,
367 void *dev)
368{
369 switch (reason) {
370
371 case CLOCK_EVT_NOTIFY_ADD:
372 return tick_check_new_device(dev);
373
374 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
375 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
376 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
377 tick_broadcast_on_off(reason, dev);
378 break;
379
380 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
381 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
382 tick_broadcast_oneshot_control(reason);
383 break;
384
385 case CLOCK_EVT_NOTIFY_CPU_DYING:
386 tick_handover_do_timer(dev);
387 break;
388
389 case CLOCK_EVT_NOTIFY_CPU_DEAD:
390 tick_shutdown_broadcast_oneshot(dev);
391 tick_shutdown_broadcast(dev);
392 tick_shutdown(dev);
393 break;
394
395 case CLOCK_EVT_NOTIFY_SUSPEND:
396 tick_suspend();
397 tick_suspend_broadcast();
398 break;
399
400 case CLOCK_EVT_NOTIFY_RESUME:
401 tick_resume();
402 break;
403
404 default:
405 break;
406 }
407
408 return NOTIFY_OK;
409}
410
411static struct notifier_block tick_notifier = {
412 .notifier_call = tick_notify,
413};
414
415/** 381/**
416 * tick_init - initialize the tick control 382 * tick_init - initialize the tick control
417 *
418 * Register the notifier with the clockevents framework
419 */ 383 */
420void __init tick_init(void) 384void __init tick_init(void)
421{ 385{
422 clockevents_register_notifier(&tick_notifier);
423 tick_broadcast_init(); 386 tick_broadcast_init();
424} 387}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f0299eae4602..bc906cad709b 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -6,6 +6,8 @@
6 6
7extern seqlock_t jiffies_lock; 7extern seqlock_t jiffies_lock;
8 8
9#define CS_NAME_LEN 32
10
9#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD 11#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
10 12
11#define TICK_DO_TIMER_NONE -1 13#define TICK_DO_TIMER_NONE -1
@@ -18,9 +20,19 @@ extern int tick_do_timer_cpu __read_mostly;
18 20
19extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); 21extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
20extern void tick_handle_periodic(struct clock_event_device *dev); 22extern void tick_handle_periodic(struct clock_event_device *dev);
23extern void tick_check_new_device(struct clock_event_device *dev);
24extern void tick_handover_do_timer(int *cpup);
25extern void tick_shutdown(unsigned int *cpup);
26extern void tick_suspend(void);
27extern void tick_resume(void);
28extern bool tick_check_replacement(struct clock_event_device *curdev,
29 struct clock_event_device *newdev);
30extern void tick_install_replacement(struct clock_event_device *dev);
21 31
22extern void clockevents_shutdown(struct clock_event_device *dev); 32extern void clockevents_shutdown(struct clock_event_device *dev);
23 33
34extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
35
24/* 36/*
25 * NO_HZ / high resolution timer shared code 37 * NO_HZ / high resolution timer shared code
26 */ 38 */
@@ -90,7 +102,7 @@ static inline bool tick_broadcast_oneshot_available(void) { return false; }
90 */ 102 */
91#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 103#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
92extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); 104extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
93extern int tick_check_broadcast_device(struct clock_event_device *dev); 105extern void tick_install_broadcast_device(struct clock_event_device *dev);
94extern int tick_is_broadcast_device(struct clock_event_device *dev); 106extern int tick_is_broadcast_device(struct clock_event_device *dev);
95extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); 107extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
96extern void tick_shutdown_broadcast(unsigned int *cpup); 108extern void tick_shutdown_broadcast(unsigned int *cpup);
@@ -102,9 +114,8 @@ tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
102 114
103#else /* !BROADCAST */ 115#else /* !BROADCAST */
104 116
105static inline int tick_check_broadcast_device(struct clock_event_device *dev) 117static inline void tick_install_broadcast_device(struct clock_event_device *dev)
106{ 118{
107 return 0;
108} 119}
109 120
110static inline int tick_is_broadcast_device(struct clock_event_device *dev) 121static inline int tick_is_broadcast_device(struct clock_event_device *dev)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index baeeb5c87cf1..48b9fffabdc2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -25,6 +25,11 @@
25 25
26#include "tick-internal.h" 26#include "tick-internal.h"
27#include "ntp_internal.h" 27#include "ntp_internal.h"
28#include "timekeeping_internal.h"
29
30#define TK_CLEAR_NTP (1 << 0)
31#define TK_MIRROR (1 << 1)
32#define TK_CLOCK_WAS_SET (1 << 2)
28 33
29static struct timekeeper timekeeper; 34static struct timekeeper timekeeper;
30static DEFINE_RAW_SPINLOCK(timekeeper_lock); 35static DEFINE_RAW_SPINLOCK(timekeeper_lock);
@@ -200,9 +205,9 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
200 205
201static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); 206static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
202 207
203static void update_pvclock_gtod(struct timekeeper *tk) 208static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
204{ 209{
205 raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); 210 raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
206} 211}
207 212
208/** 213/**
@@ -216,7 +221,7 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb)
216 221
217 raw_spin_lock_irqsave(&timekeeper_lock, flags); 222 raw_spin_lock_irqsave(&timekeeper_lock, flags);
218 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); 223 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
219 update_pvclock_gtod(tk); 224 update_pvclock_gtod(tk, true);
220 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 225 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
221 226
222 return ret; 227 return ret;
@@ -241,16 +246,16 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
241EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); 246EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
242 247
243/* must hold timekeeper_lock */ 248/* must hold timekeeper_lock */
244static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) 249static void timekeeping_update(struct timekeeper *tk, unsigned int action)
245{ 250{
246 if (clearntp) { 251 if (action & TK_CLEAR_NTP) {
247 tk->ntp_error = 0; 252 tk->ntp_error = 0;
248 ntp_clear(); 253 ntp_clear();
249 } 254 }
250 update_vsyscall(tk); 255 update_vsyscall(tk);
251 update_pvclock_gtod(tk); 256 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
252 257
253 if (mirror) 258 if (action & TK_MIRROR)
254 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); 259 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
255} 260}
256 261
@@ -508,7 +513,7 @@ int do_settimeofday(const struct timespec *tv)
508 513
509 tk_set_xtime(tk, tv); 514 tk_set_xtime(tk, tv);
510 515
511 timekeeping_update(tk, true, true); 516 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
512 517
513 write_seqcount_end(&timekeeper_seq); 518 write_seqcount_end(&timekeeper_seq);
514 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 519 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -552,7 +557,7 @@ int timekeeping_inject_offset(struct timespec *ts)
552 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); 557 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
553 558
554error: /* even if we error out, we forwarded the time, so call update */ 559error: /* even if we error out, we forwarded the time, so call update */
555 timekeeping_update(tk, true, true); 560 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
556 561
557 write_seqcount_end(&timekeeper_seq); 562 write_seqcount_end(&timekeeper_seq);
558 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 563 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -627,13 +632,22 @@ static int change_clocksource(void *data)
627 write_seqcount_begin(&timekeeper_seq); 632 write_seqcount_begin(&timekeeper_seq);
628 633
629 timekeeping_forward_now(tk); 634 timekeeping_forward_now(tk);
630 if (!new->enable || new->enable(new) == 0) { 635 /*
631 old = tk->clock; 636 * If the cs is in module, get a module reference. Succeeds
632 tk_setup_internals(tk, new); 637 * for built-in code (owner == NULL) as well.
633 if (old->disable) 638 */
634 old->disable(old); 639 if (try_module_get(new->owner)) {
640 if (!new->enable || new->enable(new) == 0) {
641 old = tk->clock;
642 tk_setup_internals(tk, new);
643 if (old->disable)
644 old->disable(old);
645 module_put(old->owner);
646 } else {
647 module_put(new->owner);
648 }
635 } 649 }
636 timekeeping_update(tk, true, true); 650 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
637 651
638 write_seqcount_end(&timekeeper_seq); 652 write_seqcount_end(&timekeeper_seq);
639 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 653 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -648,14 +662,15 @@ static int change_clocksource(void *data)
648 * This function is called from clocksource.c after a new, better clock 662 * This function is called from clocksource.c after a new, better clock
649 * source has been registered. The caller holds the clocksource_mutex. 663 * source has been registered. The caller holds the clocksource_mutex.
650 */ 664 */
651void timekeeping_notify(struct clocksource *clock) 665int timekeeping_notify(struct clocksource *clock)
652{ 666{
653 struct timekeeper *tk = &timekeeper; 667 struct timekeeper *tk = &timekeeper;
654 668
655 if (tk->clock == clock) 669 if (tk->clock == clock)
656 return; 670 return 0;
657 stop_machine(change_clocksource, clock, NULL); 671 stop_machine(change_clocksource, clock, NULL);
658 tick_clock_notify(); 672 tick_clock_notify();
673 return tk->clock == clock ? 0 : -1;
659} 674}
660 675
661/** 676/**
@@ -841,6 +856,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
841 tk_xtime_add(tk, delta); 856 tk_xtime_add(tk, delta);
842 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); 857 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta));
843 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); 858 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta));
859 tk_debug_account_sleep_time(delta);
844} 860}
845 861
846/** 862/**
@@ -872,7 +888,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
872 888
873 __timekeeping_inject_sleeptime(tk, delta); 889 __timekeeping_inject_sleeptime(tk, delta);
874 890
875 timekeeping_update(tk, true, true); 891 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
876 892
877 write_seqcount_end(&timekeeper_seq); 893 write_seqcount_end(&timekeeper_seq);
878 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 894 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -954,7 +970,7 @@ static void timekeeping_resume(void)
954 tk->cycle_last = clock->cycle_last = cycle_now; 970 tk->cycle_last = clock->cycle_last = cycle_now;
955 tk->ntp_error = 0; 971 tk->ntp_error = 0;
956 timekeeping_suspended = 0; 972 timekeeping_suspended = 0;
957 timekeeping_update(tk, false, true); 973 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
958 write_seqcount_end(&timekeeper_seq); 974 write_seqcount_end(&timekeeper_seq);
959 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 975 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
960 976
@@ -1236,9 +1252,10 @@ out_adjust:
1236 * It also calls into the NTP code to handle leapsecond processing. 1252 * It also calls into the NTP code to handle leapsecond processing.
1237 * 1253 *
1238 */ 1254 */
1239static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) 1255static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1240{ 1256{
1241 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; 1257 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
1258 unsigned int action = 0;
1242 1259
1243 while (tk->xtime_nsec >= nsecps) { 1260 while (tk->xtime_nsec >= nsecps) {
1244 int leap; 1261 int leap;
@@ -1261,8 +1278,10 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1261 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); 1278 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
1262 1279
1263 clock_was_set_delayed(); 1280 clock_was_set_delayed();
1281 action = TK_CLOCK_WAS_SET;
1264 } 1282 }
1265 } 1283 }
1284 return action;
1266} 1285}
1267 1286
1268/** 1287/**
@@ -1347,6 +1366,7 @@ static void update_wall_time(void)
1347 struct timekeeper *tk = &shadow_timekeeper; 1366 struct timekeeper *tk = &shadow_timekeeper;
1348 cycle_t offset; 1367 cycle_t offset;
1349 int shift = 0, maxshift; 1368 int shift = 0, maxshift;
1369 unsigned int action;
1350 unsigned long flags; 1370 unsigned long flags;
1351 1371
1352 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1372 raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -1399,7 +1419,7 @@ static void update_wall_time(void)
1399 * Finally, make sure that after the rounding 1419 * Finally, make sure that after the rounding
1400 * xtime_nsec isn't larger than NSEC_PER_SEC 1420 * xtime_nsec isn't larger than NSEC_PER_SEC
1401 */ 1421 */
1402 accumulate_nsecs_to_secs(tk); 1422 action = accumulate_nsecs_to_secs(tk);
1403 1423
1404 write_seqcount_begin(&timekeeper_seq); 1424 write_seqcount_begin(&timekeeper_seq);
1405 /* Update clock->cycle_last with the new value */ 1425 /* Update clock->cycle_last with the new value */
@@ -1415,7 +1435,7 @@ static void update_wall_time(void)
1415 * updating. 1435 * updating.
1416 */ 1436 */
1417 memcpy(real_tk, tk, sizeof(*tk)); 1437 memcpy(real_tk, tk, sizeof(*tk));
1418 timekeeping_update(real_tk, false, false); 1438 timekeeping_update(real_tk, action);
1419 write_seqcount_end(&timekeeper_seq); 1439 write_seqcount_end(&timekeeper_seq);
1420out: 1440out:
1421 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1441 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -1677,6 +1697,7 @@ int do_adjtimex(struct timex *txc)
1677 1697
1678 if (tai != orig_tai) { 1698 if (tai != orig_tai) {
1679 __timekeeping_set_tai_offset(tk, tai); 1699 __timekeeping_set_tai_offset(tk, tai);
1700 update_pvclock_gtod(tk, true);
1680 clock_was_set_delayed(); 1701 clock_was_set_delayed();
1681 } 1702 }
1682 write_seqcount_end(&timekeeper_seq); 1703 write_seqcount_end(&timekeeper_seq);
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
new file mode 100644
index 000000000000..802433a4f5eb
--- /dev/null
+++ b/kernel/time/timekeeping_debug.c
@@ -0,0 +1,72 @@
1/*
2 * debugfs file to track time spent in suspend
3 *
4 * Copyright (c) 2011, Google, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 */
16
17#include <linux/debugfs.h>
18#include <linux/err.h>
19#include <linux/init.h>
20#include <linux/kernel.h>
21#include <linux/seq_file.h>
22#include <linux/time.h>
23
24static unsigned int sleep_time_bin[32] = {0};
25
26static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
27{
28 unsigned int bin;
29 seq_puts(s, " time (secs) count\n");
30 seq_puts(s, "------------------------------\n");
31 for (bin = 0; bin < 32; bin++) {
32 if (sleep_time_bin[bin] == 0)
33 continue;
34 seq_printf(s, "%10u - %-10u %4u\n",
35 bin ? 1 << (bin - 1) : 0, 1 << bin,
36 sleep_time_bin[bin]);
37 }
38 return 0;
39}
40
41static int tk_debug_sleep_time_open(struct inode *inode, struct file *file)
42{
43 return single_open(file, tk_debug_show_sleep_time, NULL);
44}
45
46static const struct file_operations tk_debug_sleep_time_fops = {
47 .open = tk_debug_sleep_time_open,
48 .read = seq_read,
49 .llseek = seq_lseek,
50 .release = single_release,
51};
52
53static int __init tk_debug_sleep_time_init(void)
54{
55 struct dentry *d;
56
57 d = debugfs_create_file("sleep_time", 0444, NULL, NULL,
58 &tk_debug_sleep_time_fops);
59 if (!d) {
60 pr_err("Failed to create sleep_time debug file\n");
61 return -ENOMEM;
62 }
63
64 return 0;
65}
66late_initcall(tk_debug_sleep_time_init);
67
68void tk_debug_account_sleep_time(struct timespec *t)
69{
70 sleep_time_bin[fls(t->tv_sec)]++;
71}
72
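As a worked example of the binning above: a suspend of 100 seconds has fls(100) = 7 (100 = 0b1100100), so it increments sleep_time_bin[7] and shows up on the "64 - 128" row of the debugfs file, while a sub-second suspend (tv_sec == 0) lands in bin 0, printed as "0 - 1".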
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
new file mode 100644
index 000000000000..13323ea08ffa
--- /dev/null
+++ b/kernel/time/timekeeping_internal.h
@@ -0,0 +1,14 @@
1#ifndef _TIMEKEEPING_INTERNAL_H
2#define _TIMEKEEPING_INTERNAL_H
3/*
4 * timekeeping debug functions
5 */
6#include <linux/time.h>
7
8#ifdef CONFIG_DEBUG_FS
9extern void tk_debug_account_sleep_time(struct timespec *t);
10#else
11#define tk_debug_account_sleep_time(x)
12#endif
13
14#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/timer.c b/kernel/timer.c
index 15ffdb3f1948..15bc1b41021d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -149,9 +149,11 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
149 /* now that we have rounded, subtract the extra skew again */ 149 /* now that we have rounded, subtract the extra skew again */
150 j -= cpu * 3; 150 j -= cpu * 3;
151 151
152 if (j <= jiffies) /* rounding ate our timeout entirely; */ 152 /*
153 return original; 153 * Make sure j is still in the future. Otherwise return the
154 return j; 154 * unmodified value.
155 */
156 return time_is_after_jiffies(j) ? j : original;
155} 157}
156 158
157/** 159/**
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6c508ff33c62..67708f46baae 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -413,6 +413,17 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
413 return 0; 413 return 0;
414} 414}
415 415
416static void ftrace_sync(struct work_struct *work)
417{
418 /*
419 * This function is just a stub to implement a hard force
420 * of synchronize_sched(). This requires synchronizing
421 * tasks even in userspace and idle.
422 *
423 * Yes, function tracing is rude.
424 */
425}
426
416static int __unregister_ftrace_function(struct ftrace_ops *ops) 427static int __unregister_ftrace_function(struct ftrace_ops *ops)
417{ 428{
418 int ret; 429 int ret;
@@ -440,8 +451,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
440 * so there'll be no new users. We must ensure 451 * so there'll be no new users. We must ensure
441 * all current users are done before we free 452 * all current users are done before we free
442 * the control data. 453 * the control data.
454 * Note synchronize_sched() is not enough, as we
455 * use preempt_disable() to do RCU, but the function
456 * tracer can be called where RCU is not active
457 * (before user_exit()).
443 */ 458 */
444 synchronize_sched(); 459 schedule_on_each_cpu(ftrace_sync);
445 control_ops_free(ops); 460 control_ops_free(ops);
446 } 461 }
447 } else 462 } else
@@ -456,9 +471,13 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
456 /* 471 /*
457 * Dynamic ops may be freed, we must make sure that all 472 * Dynamic ops may be freed, we must make sure that all
458 * callers are done before leaving this function. 473 * callers are done before leaving this function.
474 *
475 * Again, normal synchronize_sched() is not good enough.
476 * We need to do a hard force of sched synchronization.
459 */ 477 */
460 if (ops->flags & FTRACE_OPS_FL_DYNAMIC) 478 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
461 synchronize_sched(); 479 schedule_on_each_cpu(ftrace_sync);
480
462 481
463 return 0; 482 return 0;
464} 483}
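schedule_on_each_cpu() with the empty ftrace_sync() stub queues a work item on every CPU and waits for all of them to run, so by the time it returns every CPU has passed through the scheduler. That is a stronger guarantee than synchronize_sched(), which the comment notes is not enough here because the function tracer can run on paths where RCU is not watching. A kernel-style sketch of the same teardown pattern (my_ops and active_ops are made-up names, not part of this patch):

    #include <linux/workqueue.h>
    #include <linux/slab.h>

    struct my_ops {                    /* made-up example structure */
        void (*func)(void);
    };

    static struct my_ops *active_ops;  /* read by a tracer-like fast path */

    static void sync_stub(struct work_struct *work)
    {
        /* Intentionally empty: having run on each CPU is the guarantee. */
    }

    static void my_ops_teardown(struct my_ops *ops)
    {
        active_ops = NULL;             /* unpublish: no new users */
        /*
         * Wait until every CPU has scheduled, so no CPU can still be
         * inside ops->func(), even where RCU is not active; only then
         * is it safe to free the data.
         */
        schedule_on_each_cpu(sync_stub);
        kfree(ops);
    }
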
@@ -622,12 +641,18 @@ static int function_stat_show(struct seq_file *m, void *v)
622 if (rec->counter <= 1) 641 if (rec->counter <= 1)
623 stddev = 0; 642 stddev = 0;
624 else { 643 else {
625 stddev = rec->time_squared - rec->counter * avg * avg; 644 /*
645 * Apply Welford's method:
646 * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
647 */
648 stddev = rec->counter * rec->time_squared -
649 rec->time * rec->time;
650
626 /* 651 /*
627 * Divide only 1000 for ns^2 -> us^2 conversion. 652 * Divide only 1000 for ns^2 -> us^2 conversion.
628 * trace_print_graph_duration will divide 1000 again. 653 * trace_print_graph_duration will divide 1000 again.
629 */ 654 */
630 do_div(stddev, (rec->counter - 1) * 1000); 655 do_div(stddev, rec->counter * (rec->counter - 1) * 1000);
631 } 656 }
632 657
633 trace_seq_init(&s); 658 trace_seq_init(&s);
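The reworked stddev path computes the sample variance purely from the running totals, s^2 = (n * sum(x^2) - (sum x)^2) / (n * (n - 1)), so the previously truncated integer average never enters the calculation; only the final do_div() scales ns^2 down to us^2. A quick user-space check that this expansion agrees with the textbook sum((x - mean)^2) / (n - 1) form:

    #include <stdio.h>

    int main(void)
    {
        double x[] = { 3, 7, 7, 19 };
        int n = sizeof(x) / sizeof(x[0]);
        double sum = 0, sumsq = 0, mean, naive = 0, expanded;
        int i;

        for (i = 0; i < n; i++) {
            sum += x[i];
            sumsq += x[i] * x[i];
        }
        mean = sum / n;
        for (i = 0; i < n; i++)
            naive += (x[i] - mean) * (x[i] - mean);
        naive /= n - 1;

        /* Same expansion as the patched function_stat_show(). */
        expanded = (n * sumsq - sum * sum) / (n * (n - 1.0));

        printf("naive=%f expanded=%f\n", naive, expanded);
        return 0;
    }

For the sample {3, 7, 7, 19} both forms give 48.
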
@@ -3512,8 +3537,12 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
3512static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; 3537static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
3513static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; 3538static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
3514 3539
3540/* Used by function selftest to not test if filter is set */
3541bool ftrace_filter_param __initdata;
3542
3515static int __init set_ftrace_notrace(char *str) 3543static int __init set_ftrace_notrace(char *str)
3516{ 3544{
3545 ftrace_filter_param = true;
3517 strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); 3546 strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
3518 return 1; 3547 return 1;
3519} 3548}
@@ -3521,6 +3550,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace);
3521 3550
3522static int __init set_ftrace_filter(char *str) 3551static int __init set_ftrace_filter(char *str)
3523{ 3552{
3553 ftrace_filter_param = true;
3524 strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); 3554 strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
3525 return 1; 3555 return 1;
3526} 3556}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e71a8be4a6ee..0cd500bffd9b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -115,6 +115,9 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
115 115
116enum ftrace_dump_mode ftrace_dump_on_oops; 116enum ftrace_dump_mode ftrace_dump_on_oops;
117 117
118/* When set, tracing will stop when a WARN*() is hit */
119int __disable_trace_on_warning;
120
118static int tracing_set_tracer(const char *buf); 121static int tracing_set_tracer(const char *buf);
119 122
120#define MAX_TRACER_SIZE 100 123#define MAX_TRACER_SIZE 100
@@ -149,6 +152,13 @@ static int __init set_ftrace_dump_on_oops(char *str)
149} 152}
150__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 153__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
151 154
155static int __init stop_trace_on_warning(char *str)
156{
157 __disable_trace_on_warning = 1;
158 return 1;
159}
160__setup("traceoff_on_warning=", stop_trace_on_warning);
161
152static int __init boot_alloc_snapshot(char *str) 162static int __init boot_alloc_snapshot(char *str)
153{ 163{
154 allocate_snapshot = true; 164 allocate_snapshot = true;
@@ -170,6 +180,7 @@ static int __init set_trace_boot_options(char *str)
170} 180}
171__setup("trace_options=", set_trace_boot_options); 181__setup("trace_options=", set_trace_boot_options);
172 182
183
173unsigned long long ns2usecs(cycle_t nsec) 184unsigned long long ns2usecs(cycle_t nsec)
174{ 185{
175 nsec += 500; 186 nsec += 500;
@@ -193,6 +204,37 @@ static struct trace_array global_trace;
193 204
194LIST_HEAD(ftrace_trace_arrays); 205LIST_HEAD(ftrace_trace_arrays);
195 206
207int trace_array_get(struct trace_array *this_tr)
208{
209 struct trace_array *tr;
210 int ret = -ENODEV;
211
212 mutex_lock(&trace_types_lock);
213 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
214 if (tr == this_tr) {
215 tr->ref++;
216 ret = 0;
217 break;
218 }
219 }
220 mutex_unlock(&trace_types_lock);
221
222 return ret;
223}
224
225static void __trace_array_put(struct trace_array *this_tr)
226{
227 WARN_ON(!this_tr->ref);
228 this_tr->ref--;
229}
230
231void trace_array_put(struct trace_array *this_tr)
232{
233 mutex_lock(&trace_types_lock);
234 __trace_array_put(this_tr);
235 mutex_unlock(&trace_types_lock);
236}
237
196int filter_current_check_discard(struct ring_buffer *buffer, 238int filter_current_check_discard(struct ring_buffer *buffer,
197 struct ftrace_event_call *call, void *rec, 239 struct ftrace_event_call *call, void *rec,
198 struct ring_buffer_event *event) 240 struct ring_buffer_event *event)
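trace_array_get() only bumps the reference if the trace_array is still present on ftrace_trace_arrays, and both get and put run under trace_types_lock, so a reference can never be taken on an instance that a concurrent removal has already torn down. The same look-up-and-get-under-one-lock pattern in a small user-space sketch (the names are made up; a pthread mutex stands in for trace_types_lock):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct obj {
        struct obj *next;              /* simple registry, stand-in for list_head */
        int ref;
    };

    static struct obj *registry;
    static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Succeeds only if the object is still published in the registry. */
    static int obj_get(struct obj *wanted)
    {
        struct obj *o;
        int ret = -1;

        pthread_mutex_lock(&registry_lock);
        for (o = registry; o; o = o->next) {
            if (o == wanted) {
                o->ref++;
                ret = 0;
                break;
            }
        }
        pthread_mutex_unlock(&registry_lock);
        return ret;
    }

    static void obj_put(struct obj *o)
    {
        pthread_mutex_lock(&registry_lock);
        if (o->ref <= 0)
            fprintf(stderr, "refcount underflow!\n");
        o->ref--;
        pthread_mutex_unlock(&registry_lock);
    }

    int main(void)
    {
        struct obj *o = calloc(1, sizeof(*o));

        registry = o;                          /* publish */
        printf("get: %d (ref=%d)\n", obj_get(o), o->ref);
        obj_put(o);
        registry = NULL;                       /* unpublish */
        printf("get after unpublish: %d\n", obj_get(o));
        free(o);
        return 0;
    }
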
@@ -215,9 +257,24 @@ cycle_t ftrace_now(int cpu)
215 return ts; 257 return ts;
216} 258}
217 259
260/**
261 * tracing_is_enabled - Show if global_trace has been disabled
262 *
263 * Shows if the global trace has been enabled or not. It uses the
264 * mirror flag "buffer_disabled" to be used in fast paths such as for
265 * the irqsoff tracer. But it may be inaccurate due to races. If you
266 * need to know the accurate state, use tracing_is_on() which is a little
267 * slower, but accurate.
268 */
218int tracing_is_enabled(void) 269int tracing_is_enabled(void)
219{ 270{
220 return tracing_is_on(); 271 /*
272 * For quick access (irqsoff uses this in fast path), just
273 * return the mirror variable of the state of the ring buffer.
274 * It's a little racy, but we don't really care.
275 */
276 smp_rmb();
277 return !global_trace.buffer_disabled;
221} 278}
222 279
223/* 280/*
@@ -240,7 +297,7 @@ static struct tracer *trace_types __read_mostly;
240/* 297/*
241 * trace_types_lock is used to protect the trace_types list. 298 * trace_types_lock is used to protect the trace_types list.
242 */ 299 */
243static DEFINE_MUTEX(trace_types_lock); 300DEFINE_MUTEX(trace_types_lock);
244 301
245/* 302/*
246 * serialize the access of the ring buffer 303 * serialize the access of the ring buffer
@@ -330,6 +387,23 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
330 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | 387 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
331 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; 388 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
332 389
390static void tracer_tracing_on(struct trace_array *tr)
391{
392 if (tr->trace_buffer.buffer)
393 ring_buffer_record_on(tr->trace_buffer.buffer);
394 /*
395 * This flag is looked at when buffers haven't been allocated
396 * yet, or by some tracers (like irqsoff), that just want to
397 * know if the ring buffer has been disabled, but it can handle
398 * races of where it gets disabled but we still do a record.
399 * As the check is in the fast path of the tracers, it is more
400 * important to be fast than accurate.
401 */
402 tr->buffer_disabled = 0;
403 /* Make the flag seen by readers */
404 smp_wmb();
405}
406
333/** 407/**
334 * tracing_on - enable tracing buffers 408 * tracing_on - enable tracing buffers
335 * 409 *
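tracer_tracing_on() updates the real ring-buffer state first, then clears the buffer_disabled mirror and issues smp_wmb(); the reader side, tracing_is_enabled() above, does smp_rmb() before looking at the flag. The mirror is deliberately allowed to be slightly stale, trading accuracy for a cheap check on the irqsoff fast path. A rough user-space sketch of the pattern, with __sync_synchronize() as a full-barrier stand-in for the kernel's smp_wmb()/smp_rmb():

    #include <stdio.h>

    static int buffer_on;          /* stands in for the ring buffer's own state */
    static int buffer_disabled;    /* cheap mirror flag checked on fast paths */

    static void sketch_tracing_on(void)
    {
        buffer_on = 1;             /* the authoritative state change */
        buffer_disabled = 0;       /* update the mirror */
        __sync_synchronize();      /* stand-in for smp_wmb(): publish the flag */
    }

    static int sketch_is_enabled(void)
    {
        __sync_synchronize();      /* stand-in for smp_rmb() */
        return !buffer_disabled;   /* may be slightly stale; that is accepted */
    }

    int main(void)
    {
        sketch_tracing_on();
        printf("enabled? %d\n", sketch_is_enabled());
        return 0;
    }
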
@@ -338,15 +412,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
338 */ 412 */
339void tracing_on(void) 413void tracing_on(void)
340{ 414{
341 if (global_trace.trace_buffer.buffer) 415 tracer_tracing_on(&global_trace);
342 ring_buffer_record_on(global_trace.trace_buffer.buffer);
343 /*
344 * This flag is only looked at when buffers haven't been
345 * allocated yet. We don't really care about the race
346 * between setting this flag and actually turning
347 * on the buffer.
348 */
349 global_trace.buffer_disabled = 0;
350} 416}
351EXPORT_SYMBOL_GPL(tracing_on); 417EXPORT_SYMBOL_GPL(tracing_on);
352 418
@@ -540,6 +606,23 @@ void tracing_snapshot_alloc(void)
540EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); 606EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
541#endif /* CONFIG_TRACER_SNAPSHOT */ 607#endif /* CONFIG_TRACER_SNAPSHOT */
542 608
609static void tracer_tracing_off(struct trace_array *tr)
610{
611 if (tr->trace_buffer.buffer)
612 ring_buffer_record_off(tr->trace_buffer.buffer);
613 /*
614 * This flag is looked at when buffers haven't been allocated
615 * yet, or by some tracers (like irqsoff), that just want to
616 * know if the ring buffer has been disabled, but it can handle
617 * races of where it gets disabled but we still do a record.
618 * As the check is in the fast path of the tracers, it is more
619 * important to be fast than accurate.
620 */
621 tr->buffer_disabled = 1;
622 /* Make the flag seen by readers */
623 smp_wmb();
624}
625
543/** 626/**
544 * tracing_off - turn off tracing buffers 627 * tracing_off - turn off tracing buffers
545 * 628 *
@@ -550,26 +633,35 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
550 */ 633 */
551void tracing_off(void) 634void tracing_off(void)
552{ 635{
553 if (global_trace.trace_buffer.buffer) 636 tracer_tracing_off(&global_trace);
554 ring_buffer_record_off(global_trace.trace_buffer.buffer);
555 /*
556 * This flag is only looked at when buffers haven't been
557 * allocated yet. We don't really care about the race
558 * between setting this flag and actually turning
559 * on the buffer.
560 */
561 global_trace.buffer_disabled = 1;
562} 637}
563EXPORT_SYMBOL_GPL(tracing_off); 638EXPORT_SYMBOL_GPL(tracing_off);
564 639
640void disable_trace_on_warning(void)
641{
642 if (__disable_trace_on_warning)
643 tracing_off();
644}
645
646/**
647 * tracer_tracing_is_on - show real state of ring buffer enabled
648 * @tr : the trace array to know if ring buffer is enabled
649 *
650 * Shows real state of the ring buffer if it is enabled or not.
651 */
652static int tracer_tracing_is_on(struct trace_array *tr)
653{
654 if (tr->trace_buffer.buffer)
655 return ring_buffer_record_is_on(tr->trace_buffer.buffer);
656 return !tr->buffer_disabled;
657}
658
565/** 659/**
566 * tracing_is_on - show state of ring buffers enabled 660 * tracing_is_on - show state of ring buffers enabled
567 */ 661 */
568int tracing_is_on(void) 662int tracing_is_on(void)
569{ 663{
570 if (global_trace.trace_buffer.buffer) 664 return tracer_tracing_is_on(&global_trace);
571 return ring_buffer_record_is_on(global_trace.trace_buffer.buffer);
572 return !global_trace.buffer_disabled;
573} 665}
574EXPORT_SYMBOL_GPL(tracing_is_on); 666EXPORT_SYMBOL_GPL(tracing_is_on);
575 667
@@ -1543,15 +1635,6 @@ trace_function(struct trace_array *tr,
1543 __buffer_unlock_commit(buffer, event); 1635 __buffer_unlock_commit(buffer, event);
1544} 1636}
1545 1637
1546void
1547ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1548 unsigned long ip, unsigned long parent_ip, unsigned long flags,
1549 int pc)
1550{
1551 if (likely(!atomic_read(&data->disabled)))
1552 trace_function(tr, ip, parent_ip, flags, pc);
1553}
1554
1555#ifdef CONFIG_STACKTRACE 1638#ifdef CONFIG_STACKTRACE
1556 1639
1557#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) 1640#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
@@ -2768,10 +2851,9 @@ static const struct seq_operations tracer_seq_ops = {
2768}; 2851};
2769 2852
2770static struct trace_iterator * 2853static struct trace_iterator *
2771__tracing_open(struct inode *inode, struct file *file, bool snapshot) 2854__tracing_open(struct trace_array *tr, struct trace_cpu *tc,
2855 struct inode *inode, struct file *file, bool snapshot)
2772{ 2856{
2773 struct trace_cpu *tc = inode->i_private;
2774 struct trace_array *tr = tc->tr;
2775 struct trace_iterator *iter; 2857 struct trace_iterator *iter;
2776 int cpu; 2858 int cpu;
2777 2859
@@ -2850,8 +2932,6 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2850 tracing_iter_reset(iter, cpu); 2932 tracing_iter_reset(iter, cpu);
2851 } 2933 }
2852 2934
2853 tr->ref++;
2854
2855 mutex_unlock(&trace_types_lock); 2935 mutex_unlock(&trace_types_lock);
2856 2936
2857 return iter; 2937 return iter;
@@ -2874,6 +2954,43 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2874 return 0; 2954 return 0;
2875} 2955}
2876 2956
2957/*
2958 * Open and update trace_array ref count.
2959 * Must have the current trace_array passed to it.
2960 */
2961static int tracing_open_generic_tr(struct inode *inode, struct file *filp)
2962{
2963 struct trace_array *tr = inode->i_private;
2964
2965 if (tracing_disabled)
2966 return -ENODEV;
2967
2968 if (trace_array_get(tr) < 0)
2969 return -ENODEV;
2970
2971 filp->private_data = inode->i_private;
2972
2973 return 0;
2974
2975}
2976
2977static int tracing_open_generic_tc(struct inode *inode, struct file *filp)
2978{
2979 struct trace_cpu *tc = inode->i_private;
2980 struct trace_array *tr = tc->tr;
2981
2982 if (tracing_disabled)
2983 return -ENODEV;
2984
2985 if (trace_array_get(tr) < 0)
2986 return -ENODEV;
2987
2988 filp->private_data = inode->i_private;
2989
2990 return 0;
2991
2992}
2993
2877static int tracing_release(struct inode *inode, struct file *file) 2994static int tracing_release(struct inode *inode, struct file *file)
2878{ 2995{
2879 struct seq_file *m = file->private_data; 2996 struct seq_file *m = file->private_data;
@@ -2881,17 +2998,20 @@ static int tracing_release(struct inode *inode, struct file *file)
2881 struct trace_array *tr; 2998 struct trace_array *tr;
2882 int cpu; 2999 int cpu;
2883 3000
2884 if (!(file->f_mode & FMODE_READ)) 3001 /* Writes do not use seq_file, need to grab tr from inode */
3002 if (!(file->f_mode & FMODE_READ)) {
3003 struct trace_cpu *tc = inode->i_private;
3004
3005 trace_array_put(tc->tr);
2885 return 0; 3006 return 0;
3007 }
2886 3008
2887 iter = m->private; 3009 iter = m->private;
2888 tr = iter->tr; 3010 tr = iter->tr;
3011 trace_array_put(tr);
2889 3012
2890 mutex_lock(&trace_types_lock); 3013 mutex_lock(&trace_types_lock);
2891 3014
2892 WARN_ON(!tr->ref);
2893 tr->ref--;
2894
2895 for_each_tracing_cpu(cpu) { 3015 for_each_tracing_cpu(cpu) {
2896 if (iter->buffer_iter[cpu]) 3016 if (iter->buffer_iter[cpu])
2897 ring_buffer_read_finish(iter->buffer_iter[cpu]); 3017 ring_buffer_read_finish(iter->buffer_iter[cpu]);
@@ -2910,20 +3030,49 @@ static int tracing_release(struct inode *inode, struct file *file)
2910 kfree(iter->trace); 3030 kfree(iter->trace);
2911 kfree(iter->buffer_iter); 3031 kfree(iter->buffer_iter);
2912 seq_release_private(inode, file); 3032 seq_release_private(inode, file);
3033
3034 return 0;
3035}
3036
3037static int tracing_release_generic_tr(struct inode *inode, struct file *file)
3038{
3039 struct trace_array *tr = inode->i_private;
3040
3041 trace_array_put(tr);
2913 return 0; 3042 return 0;
2914} 3043}
2915 3044
3045static int tracing_release_generic_tc(struct inode *inode, struct file *file)
3046{
3047 struct trace_cpu *tc = inode->i_private;
3048 struct trace_array *tr = tc->tr;
3049
3050 trace_array_put(tr);
3051 return 0;
3052}
3053
3054static int tracing_single_release_tr(struct inode *inode, struct file *file)
3055{
3056 struct trace_array *tr = inode->i_private;
3057
3058 trace_array_put(tr);
3059
3060 return single_release(inode, file);
3061}
3062
2916static int tracing_open(struct inode *inode, struct file *file) 3063static int tracing_open(struct inode *inode, struct file *file)
2917{ 3064{
3065 struct trace_cpu *tc = inode->i_private;
3066 struct trace_array *tr = tc->tr;
2918 struct trace_iterator *iter; 3067 struct trace_iterator *iter;
2919 int ret = 0; 3068 int ret = 0;
2920 3069
3070 if (trace_array_get(tr) < 0)
3071 return -ENODEV;
3072
2921 /* If this file was open for write, then erase contents */ 3073 /* If this file was open for write, then erase contents */
2922 if ((file->f_mode & FMODE_WRITE) && 3074 if ((file->f_mode & FMODE_WRITE) &&
2923 (file->f_flags & O_TRUNC)) { 3075 (file->f_flags & O_TRUNC)) {
2924 struct trace_cpu *tc = inode->i_private;
2925 struct trace_array *tr = tc->tr;
2926
2927 if (tc->cpu == RING_BUFFER_ALL_CPUS) 3076 if (tc->cpu == RING_BUFFER_ALL_CPUS)
2928 tracing_reset_online_cpus(&tr->trace_buffer); 3077 tracing_reset_online_cpus(&tr->trace_buffer);
2929 else 3078 else
@@ -2931,12 +3080,16 @@ static int tracing_open(struct inode *inode, struct file *file)
2931 } 3080 }
2932 3081
2933 if (file->f_mode & FMODE_READ) { 3082 if (file->f_mode & FMODE_READ) {
2934 iter = __tracing_open(inode, file, false); 3083 iter = __tracing_open(tr, tc, inode, file, false);
2935 if (IS_ERR(iter)) 3084 if (IS_ERR(iter))
2936 ret = PTR_ERR(iter); 3085 ret = PTR_ERR(iter);
2937 else if (trace_flags & TRACE_ITER_LATENCY_FMT) 3086 else if (trace_flags & TRACE_ITER_LATENCY_FMT)
2938 iter->iter_flags |= TRACE_FILE_LAT_FMT; 3087 iter->iter_flags |= TRACE_FILE_LAT_FMT;
2939 } 3088 }
3089
3090 if (ret < 0)
3091 trace_array_put(tr);
3092
2940 return ret; 3093 return ret;
2941} 3094}
2942 3095
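Every file that now pins the trace_array in .open pairs that with a trace_array_put() in .release, and also drops the reference on the .open error path, so an instance cannot disappear while one of its debugfs files is still open. A sketch of that pairing for a hypothetical single_open()-based file (the my_* names are made up; trace_array_get()/trace_array_put() are the helpers declared in kernel/trace/trace.h later in this diff):

    #include <linux/fs.h>
    #include <linux/seq_file.h>
    #include <linux/errno.h>
    #include "trace.h"                        /* trace_array_get()/put() */

    static int my_show(struct seq_file *m, void *v)
    {
        seq_puts(m, "hello\n");
        return 0;
    }

    static int my_open(struct inode *inode, struct file *filp)
    {
        struct trace_array *tr = inode->i_private;
        int ret;

        if (trace_array_get(tr) < 0)          /* pin the instance */
            return -ENODEV;

        ret = single_open(filp, my_show, inode->i_private);
        if (ret < 0)
            trace_array_put(tr);              /* undo on failure */
        return ret;
    }

    static int my_release(struct inode *inode, struct file *filp)
    {
        trace_array_put(inode->i_private);    /* unpin on the last close */
        return single_release(inode, filp);
    }

    static const struct file_operations my_fops = {
        .open    = my_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = my_release,
    };
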
@@ -3293,9 +3446,14 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
3293 3446
3294static int tracing_trace_options_open(struct inode *inode, struct file *file) 3447static int tracing_trace_options_open(struct inode *inode, struct file *file)
3295{ 3448{
3449 struct trace_array *tr = inode->i_private;
3450
3296 if (tracing_disabled) 3451 if (tracing_disabled)
3297 return -ENODEV; 3452 return -ENODEV;
3298 3453
3454 if (trace_array_get(tr) < 0)
3455 return -ENODEV;
3456
3299 return single_open(file, tracing_trace_options_show, inode->i_private); 3457 return single_open(file, tracing_trace_options_show, inode->i_private);
3300} 3458}
3301 3459
@@ -3303,7 +3461,7 @@ static const struct file_operations tracing_iter_fops = {
3303 .open = tracing_trace_options_open, 3461 .open = tracing_trace_options_open,
3304 .read = seq_read, 3462 .read = seq_read,
3305 .llseek = seq_lseek, 3463 .llseek = seq_lseek,
3306 .release = single_release, 3464 .release = tracing_single_release_tr,
3307 .write = tracing_trace_options_write, 3465 .write = tracing_trace_options_write,
3308}; 3466};
3309 3467
@@ -3791,6 +3949,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3791 if (tracing_disabled) 3949 if (tracing_disabled)
3792 return -ENODEV; 3950 return -ENODEV;
3793 3951
3952 if (trace_array_get(tr) < 0)
3953 return -ENODEV;
3954
3794 mutex_lock(&trace_types_lock); 3955 mutex_lock(&trace_types_lock);
3795 3956
3796 /* create a buffer to store the information to pass to userspace */ 3957 /* create a buffer to store the information to pass to userspace */
@@ -3843,6 +4004,7 @@ out:
3843fail: 4004fail:
3844 kfree(iter->trace); 4005 kfree(iter->trace);
3845 kfree(iter); 4006 kfree(iter);
4007 __trace_array_put(tr);
3846 mutex_unlock(&trace_types_lock); 4008 mutex_unlock(&trace_types_lock);
3847 return ret; 4009 return ret;
3848} 4010}
@@ -3850,6 +4012,8 @@ fail:
3850static int tracing_release_pipe(struct inode *inode, struct file *file) 4012static int tracing_release_pipe(struct inode *inode, struct file *file)
3851{ 4013{
3852 struct trace_iterator *iter = file->private_data; 4014 struct trace_iterator *iter = file->private_data;
4015 struct trace_cpu *tc = inode->i_private;
4016 struct trace_array *tr = tc->tr;
3853 4017
3854 mutex_lock(&trace_types_lock); 4018 mutex_lock(&trace_types_lock);
3855 4019
@@ -3863,6 +4027,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
3863 kfree(iter->trace); 4027 kfree(iter->trace);
3864 kfree(iter); 4028 kfree(iter);
3865 4029
4030 trace_array_put(tr);
4031
3866 return 0; 4032 return 0;
3867} 4033}
3868 4034
@@ -3939,7 +4105,7 @@ static int tracing_wait_pipe(struct file *filp)
3939 * 4105 *
3940 * iter->pos will be 0 if we haven't read anything. 4106 * iter->pos will be 0 if we haven't read anything.
3941 */ 4107 */
3942 if (!tracing_is_enabled() && iter->pos) 4108 if (!tracing_is_on() && iter->pos)
3943 break; 4109 break;
3944 } 4110 }
3945 4111
@@ -4320,6 +4486,8 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
4320 /* resize the ring buffer to 0 */ 4486 /* resize the ring buffer to 0 */
4321 tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); 4487 tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
4322 4488
4489 trace_array_put(tr);
4490
4323 return 0; 4491 return 0;
4324} 4492}
4325 4493
@@ -4328,6 +4496,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4328 size_t cnt, loff_t *fpos) 4496 size_t cnt, loff_t *fpos)
4329{ 4497{
4330 unsigned long addr = (unsigned long)ubuf; 4498 unsigned long addr = (unsigned long)ubuf;
4499 struct trace_array *tr = filp->private_data;
4331 struct ring_buffer_event *event; 4500 struct ring_buffer_event *event;
4332 struct ring_buffer *buffer; 4501 struct ring_buffer *buffer;
4333 struct print_entry *entry; 4502 struct print_entry *entry;
@@ -4387,7 +4556,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4387 4556
4388 local_save_flags(irq_flags); 4557 local_save_flags(irq_flags);
4389 size = sizeof(*entry) + cnt + 2; /* possible \n added */ 4558 size = sizeof(*entry) + cnt + 2; /* possible \n added */
4390 buffer = global_trace.trace_buffer.buffer; 4559 buffer = tr->trace_buffer.buffer;
4391 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 4560 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
4392 irq_flags, preempt_count()); 4561 irq_flags, preempt_count());
4393 if (!event) { 4562 if (!event) {
@@ -4495,10 +4664,20 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4495 4664
4496static int tracing_clock_open(struct inode *inode, struct file *file) 4665static int tracing_clock_open(struct inode *inode, struct file *file)
4497{ 4666{
4667 struct trace_array *tr = inode->i_private;
4668 int ret;
4669
4498 if (tracing_disabled) 4670 if (tracing_disabled)
4499 return -ENODEV; 4671 return -ENODEV;
4500 4672
4501 return single_open(file, tracing_clock_show, inode->i_private); 4673 if (trace_array_get(tr))
4674 return -ENODEV;
4675
4676 ret = single_open(file, tracing_clock_show, inode->i_private);
4677 if (ret < 0)
4678 trace_array_put(tr);
4679
4680 return ret;
4502} 4681}
4503 4682
4504struct ftrace_buffer_info { 4683struct ftrace_buffer_info {
@@ -4511,12 +4690,16 @@ struct ftrace_buffer_info {
4511static int tracing_snapshot_open(struct inode *inode, struct file *file) 4690static int tracing_snapshot_open(struct inode *inode, struct file *file)
4512{ 4691{
4513 struct trace_cpu *tc = inode->i_private; 4692 struct trace_cpu *tc = inode->i_private;
4693 struct trace_array *tr = tc->tr;
4514 struct trace_iterator *iter; 4694 struct trace_iterator *iter;
4515 struct seq_file *m; 4695 struct seq_file *m;
4516 int ret = 0; 4696 int ret = 0;
4517 4697
4698 if (trace_array_get(tr) < 0)
4699 return -ENODEV;
4700
4518 if (file->f_mode & FMODE_READ) { 4701 if (file->f_mode & FMODE_READ) {
4519 iter = __tracing_open(inode, file, true); 4702 iter = __tracing_open(tr, tc, inode, file, true);
4520 if (IS_ERR(iter)) 4703 if (IS_ERR(iter))
4521 ret = PTR_ERR(iter); 4704 ret = PTR_ERR(iter);
4522 } else { 4705 } else {
@@ -4529,13 +4712,16 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file)
4529 kfree(m); 4712 kfree(m);
4530 return -ENOMEM; 4713 return -ENOMEM;
4531 } 4714 }
4532 iter->tr = tc->tr; 4715 iter->tr = tr;
4533 iter->trace_buffer = &tc->tr->max_buffer; 4716 iter->trace_buffer = &tc->tr->max_buffer;
4534 iter->cpu_file = tc->cpu; 4717 iter->cpu_file = tc->cpu;
4535 m->private = iter; 4718 m->private = iter;
4536 file->private_data = m; 4719 file->private_data = m;
4537 } 4720 }
4538 4721
4722 if (ret < 0)
4723 trace_array_put(tr);
4724
4539 return ret; 4725 return ret;
4540} 4726}
4541 4727
@@ -4616,9 +4802,12 @@ out:
4616static int tracing_snapshot_release(struct inode *inode, struct file *file) 4802static int tracing_snapshot_release(struct inode *inode, struct file *file)
4617{ 4803{
4618 struct seq_file *m = file->private_data; 4804 struct seq_file *m = file->private_data;
4805 int ret;
4806
4807 ret = tracing_release(inode, file);
4619 4808
4620 if (file->f_mode & FMODE_READ) 4809 if (file->f_mode & FMODE_READ)
4621 return tracing_release(inode, file); 4810 return ret;
4622 4811
4623 /* If write only, the seq_file is just a stub */ 4812 /* If write only, the seq_file is just a stub */
4624 if (m) 4813 if (m)
@@ -4684,34 +4873,38 @@ static const struct file_operations tracing_pipe_fops = {
4684}; 4873};
4685 4874
4686static const struct file_operations tracing_entries_fops = { 4875static const struct file_operations tracing_entries_fops = {
4687 .open = tracing_open_generic, 4876 .open = tracing_open_generic_tc,
4688 .read = tracing_entries_read, 4877 .read = tracing_entries_read,
4689 .write = tracing_entries_write, 4878 .write = tracing_entries_write,
4690 .llseek = generic_file_llseek, 4879 .llseek = generic_file_llseek,
4880 .release = tracing_release_generic_tc,
4691}; 4881};
4692 4882
4693static const struct file_operations tracing_total_entries_fops = { 4883static const struct file_operations tracing_total_entries_fops = {
4694 .open = tracing_open_generic, 4884 .open = tracing_open_generic_tr,
4695 .read = tracing_total_entries_read, 4885 .read = tracing_total_entries_read,
4696 .llseek = generic_file_llseek, 4886 .llseek = generic_file_llseek,
4887 .release = tracing_release_generic_tr,
4697}; 4888};
4698 4889
4699static const struct file_operations tracing_free_buffer_fops = { 4890static const struct file_operations tracing_free_buffer_fops = {
4891 .open = tracing_open_generic_tr,
4700 .write = tracing_free_buffer_write, 4892 .write = tracing_free_buffer_write,
4701 .release = tracing_free_buffer_release, 4893 .release = tracing_free_buffer_release,
4702}; 4894};
4703 4895
4704static const struct file_operations tracing_mark_fops = { 4896static const struct file_operations tracing_mark_fops = {
4705 .open = tracing_open_generic, 4897 .open = tracing_open_generic_tr,
4706 .write = tracing_mark_write, 4898 .write = tracing_mark_write,
4707 .llseek = generic_file_llseek, 4899 .llseek = generic_file_llseek,
4900 .release = tracing_release_generic_tr,
4708}; 4901};
4709 4902
4710static const struct file_operations trace_clock_fops = { 4903static const struct file_operations trace_clock_fops = {
4711 .open = tracing_clock_open, 4904 .open = tracing_clock_open,
4712 .read = seq_read, 4905 .read = seq_read,
4713 .llseek = seq_lseek, 4906 .llseek = seq_lseek,
4714 .release = single_release, 4907 .release = tracing_single_release_tr,
4715 .write = tracing_clock_write, 4908 .write = tracing_clock_write,
4716}; 4909};
4717 4910
@@ -4739,13 +4932,19 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
4739 struct trace_cpu *tc = inode->i_private; 4932 struct trace_cpu *tc = inode->i_private;
4740 struct trace_array *tr = tc->tr; 4933 struct trace_array *tr = tc->tr;
4741 struct ftrace_buffer_info *info; 4934 struct ftrace_buffer_info *info;
4935 int ret;
4742 4936
4743 if (tracing_disabled) 4937 if (tracing_disabled)
4744 return -ENODEV; 4938 return -ENODEV;
4745 4939
4940 if (trace_array_get(tr) < 0)
4941 return -ENODEV;
4942
4746 info = kzalloc(sizeof(*info), GFP_KERNEL); 4943 info = kzalloc(sizeof(*info), GFP_KERNEL);
4747 if (!info) 4944 if (!info) {
4945 trace_array_put(tr);
4748 return -ENOMEM; 4946 return -ENOMEM;
4947 }
4749 4948
4750 mutex_lock(&trace_types_lock); 4949 mutex_lock(&trace_types_lock);
4751 4950
@@ -4763,7 +4962,11 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
4763 4962
4764 mutex_unlock(&trace_types_lock); 4963 mutex_unlock(&trace_types_lock);
4765 4964
4766 return nonseekable_open(inode, filp); 4965 ret = nonseekable_open(inode, filp);
4966 if (ret < 0)
4967 trace_array_put(tr);
4968
4969 return ret;
4767} 4970}
4768 4971
4769static unsigned int 4972static unsigned int
@@ -4863,8 +5066,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
4863 5066
4864 mutex_lock(&trace_types_lock); 5067 mutex_lock(&trace_types_lock);
4865 5068
4866 WARN_ON(!iter->tr->ref); 5069 __trace_array_put(iter->tr);
4867 iter->tr->ref--;
4868 5070
4869 if (info->spare) 5071 if (info->spare)
4870 ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); 5072 ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
@@ -5612,15 +5814,10 @@ rb_simple_read(struct file *filp, char __user *ubuf,
5612 size_t cnt, loff_t *ppos) 5814 size_t cnt, loff_t *ppos)
5613{ 5815{
5614 struct trace_array *tr = filp->private_data; 5816 struct trace_array *tr = filp->private_data;
5615 struct ring_buffer *buffer = tr->trace_buffer.buffer;
5616 char buf[64]; 5817 char buf[64];
5617 int r; 5818 int r;
5618 5819
5619 if (buffer) 5820 r = tracer_tracing_is_on(tr);
5620 r = ring_buffer_record_is_on(buffer);
5621 else
5622 r = 0;
5623
5624 r = sprintf(buf, "%d\n", r); 5821 r = sprintf(buf, "%d\n", r);
5625 5822
5626 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 5823 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -5642,11 +5839,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
5642 if (buffer) { 5839 if (buffer) {
5643 mutex_lock(&trace_types_lock); 5840 mutex_lock(&trace_types_lock);
5644 if (val) { 5841 if (val) {
5645 ring_buffer_record_on(buffer); 5842 tracer_tracing_on(tr);
5646 if (tr->current_trace->start) 5843 if (tr->current_trace->start)
5647 tr->current_trace->start(tr); 5844 tr->current_trace->start(tr);
5648 } else { 5845 } else {
5649 ring_buffer_record_off(buffer); 5846 tracer_tracing_off(tr);
5650 if (tr->current_trace->stop) 5847 if (tr->current_trace->stop)
5651 tr->current_trace->stop(tr); 5848 tr->current_trace->stop(tr);
5652 } 5849 }
@@ -5659,9 +5856,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
5659} 5856}
5660 5857
5661static const struct file_operations rb_simple_fops = { 5858static const struct file_operations rb_simple_fops = {
5662 .open = tracing_open_generic, 5859 .open = tracing_open_generic_tr,
5663 .read = rb_simple_read, 5860 .read = rb_simple_read,
5664 .write = rb_simple_write, 5861 .write = rb_simple_write,
5862 .release = tracing_release_generic_tr,
5665 .llseek = default_llseek, 5863 .llseek = default_llseek,
5666}; 5864};
5667 5865
@@ -5933,7 +6131,7 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
5933 trace_create_file("buffer_total_size_kb", 0444, d_tracer, 6131 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
5934 tr, &tracing_total_entries_fops); 6132 tr, &tracing_total_entries_fops);
5935 6133
5936 trace_create_file("free_buffer", 0644, d_tracer, 6134 trace_create_file("free_buffer", 0200, d_tracer,
5937 tr, &tracing_free_buffer_fops); 6135 tr, &tracing_free_buffer_fops);
5938 6136
5939 trace_create_file("trace_marker", 0220, d_tracer, 6137 trace_create_file("trace_marker", 0220, d_tracer,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 20572ed88c5c..4a4f6e1828b6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -224,6 +224,11 @@ enum {
224 224
225extern struct list_head ftrace_trace_arrays; 225extern struct list_head ftrace_trace_arrays;
226 226
227extern struct mutex trace_types_lock;
228
229extern int trace_array_get(struct trace_array *tr);
230extern void trace_array_put(struct trace_array *tr);
231
227/* 232/*
228 * The global tracer (top) should be the first trace array added, 233 * The global tracer (top) should be the first trace array added,
229 * but we check the flag anyway. 234 * but we check the flag anyway.
@@ -554,11 +559,6 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu);
554 559
555void poll_wait_pipe(struct trace_iterator *iter); 560void poll_wait_pipe(struct trace_iterator *iter);
556 561
557void ftrace(struct trace_array *tr,
558 struct trace_array_cpu *data,
559 unsigned long ip,
560 unsigned long parent_ip,
561 unsigned long flags, int pc);
562void tracing_sched_switch_trace(struct trace_array *tr, 562void tracing_sched_switch_trace(struct trace_array *tr,
563 struct task_struct *prev, 563 struct task_struct *prev,
564 struct task_struct *next, 564 struct task_struct *next,
@@ -774,6 +774,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
774extern struct list_head ftrace_pids; 774extern struct list_head ftrace_pids;
775 775
776#ifdef CONFIG_FUNCTION_TRACER 776#ifdef CONFIG_FUNCTION_TRACER
777extern bool ftrace_filter_param __initdata;
777static inline int ftrace_trace_task(struct task_struct *task) 778static inline int ftrace_trace_task(struct task_struct *task)
778{ 779{
779 if (list_empty(&ftrace_pids)) 780 if (list_empty(&ftrace_pids))
@@ -899,12 +900,6 @@ static inline void trace_branch_disable(void)
899/* set ring buffers to default size if not already done so */ 900/* set ring buffers to default size if not already done so */
900int tracing_update_buffers(void); 901int tracing_update_buffers(void);
901 902
902/* trace event type bit fields, not numeric */
903enum {
904 TRACE_EVENT_TYPE_PRINTF = 1,
905 TRACE_EVENT_TYPE_RAW = 2,
906};
907
908struct ftrace_event_field { 903struct ftrace_event_field {
909 struct list_head link; 904 struct list_head link;
910 const char *name; 905 const char *name;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 27963e2bf4bf..7d854290bf81 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -41,6 +41,23 @@ static LIST_HEAD(ftrace_common_fields);
41static struct kmem_cache *field_cachep; 41static struct kmem_cache *field_cachep;
42static struct kmem_cache *file_cachep; 42static struct kmem_cache *file_cachep;
43 43
44#define SYSTEM_FL_FREE_NAME (1 << 31)
45
46static inline int system_refcount(struct event_subsystem *system)
47{
48 return system->ref_count & ~SYSTEM_FL_FREE_NAME;
49}
50
51static int system_refcount_inc(struct event_subsystem *system)
52{
53 return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME;
54}
55
56static int system_refcount_dec(struct event_subsystem *system)
57{
58 return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME;
59}
60
44/* Double loops, do not use break, only goto's work */ 61/* Double loops, do not use break, only goto's work */
45#define do_for_each_event_file(tr, file) \ 62#define do_for_each_event_file(tr, file) \
46 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ 63 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
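SYSTEM_FL_FREE_NAME borrows bit 31 of ref_count as a "name was kstrdup()ed, free it on teardown" marker, and the three helpers mask that bit back out whenever the count is read, incremented or decremented, so ordinary refcount traffic never disturbs the flag. A quick user-space check of the masking (struct subsys is a stand-in for event_subsystem):

    #include <stdio.h>

    #define FL_FREE_NAME (1u << 31)

    struct subsys {
        unsigned int ref_count;    /* low 31 bits: count, bit 31: flag */
    };

    static unsigned int refcount(struct subsys *s)
    {
        return s->ref_count & ~FL_FREE_NAME;
    }

    static unsigned int refcount_inc(struct subsys *s)
    {
        return (s->ref_count++) & ~FL_FREE_NAME;
    }

    static unsigned int refcount_dec(struct subsys *s)
    {
        return (--s->ref_count) & ~FL_FREE_NAME;
    }

    int main(void)
    {
        struct subsys s = { .ref_count = 1 | FL_FREE_NAME };

        refcount_inc(&s);
        refcount_dec(&s);
        printf("count=%u, frees name=%s\n", refcount(&s),
               (s.ref_count & FL_FREE_NAME) ? "yes" : "no");
        return 0;
    }

After one inc/dec round trip the count is back to 1 and the flag is still set.
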
@@ -97,7 +114,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
97 114
98 field = kmem_cache_alloc(field_cachep, GFP_TRACE); 115 field = kmem_cache_alloc(field_cachep, GFP_TRACE);
99 if (!field) 116 if (!field)
100 goto err; 117 return -ENOMEM;
101 118
102 field->name = name; 119 field->name = name;
103 field->type = type; 120 field->type = type;
@@ -114,11 +131,6 @@ static int __trace_define_field(struct list_head *head, const char *type,
114 list_add(&field->link, head); 131 list_add(&field->link, head);
115 132
116 return 0; 133 return 0;
117
118err:
119 kmem_cache_free(field_cachep, field);
120
121 return -ENOMEM;
122} 134}
123 135
124int trace_define_field(struct ftrace_event_call *call, const char *type, 136int trace_define_field(struct ftrace_event_call *call, const char *type,
@@ -279,9 +291,11 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
279 } 291 }
280 call->class->reg(call, TRACE_REG_UNREGISTER, file); 292 call->class->reg(call, TRACE_REG_UNREGISTER, file);
281 } 293 }
282 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ 294 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
283 if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) 295 if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
284 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); 296 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
297 else
298 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
285 break; 299 break;
286 case 1: 300 case 1:
287 /* 301 /*
@@ -349,8 +363,8 @@ static void __put_system(struct event_subsystem *system)
349{ 363{
350 struct event_filter *filter = system->filter; 364 struct event_filter *filter = system->filter;
351 365
352 WARN_ON_ONCE(system->ref_count == 0); 366 WARN_ON_ONCE(system_refcount(system) == 0);
353 if (--system->ref_count) 367 if (system_refcount_dec(system))
354 return; 368 return;
355 369
356 list_del(&system->list); 370 list_del(&system->list);
@@ -359,13 +373,15 @@ static void __put_system(struct event_subsystem *system)
359 kfree(filter->filter_string); 373 kfree(filter->filter_string);
360 kfree(filter); 374 kfree(filter);
361 } 375 }
376 if (system->ref_count & SYSTEM_FL_FREE_NAME)
377 kfree(system->name);
362 kfree(system); 378 kfree(system);
363} 379}
364 380
365static void __get_system(struct event_subsystem *system) 381static void __get_system(struct event_subsystem *system)
366{ 382{
367 WARN_ON_ONCE(system->ref_count == 0); 383 WARN_ON_ONCE(system_refcount(system) == 0);
368 system->ref_count++; 384 system_refcount_inc(system);
369} 385}
370 386
371static void __get_system_dir(struct ftrace_subsystem_dir *dir) 387static void __get_system_dir(struct ftrace_subsystem_dir *dir)
@@ -379,7 +395,7 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir)
379{ 395{
380 WARN_ON_ONCE(dir->ref_count == 0); 396 WARN_ON_ONCE(dir->ref_count == 0);
381 /* If the subsystem is about to be freed, the dir must be too */ 397 /* If the subsystem is about to be freed, the dir must be too */
382 WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); 398 WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1);
383 399
384 __put_system(dir->subsystem); 400 __put_system(dir->subsystem);
385 if (!--dir->ref_count) 401 if (!--dir->ref_count)
@@ -394,16 +410,45 @@ static void put_system(struct ftrace_subsystem_dir *dir)
394} 410}
395 411
396/* 412/*
413 * Open and update trace_array ref count.
414 * Must have the current trace_array passed to it.
415 */
416static int tracing_open_generic_file(struct inode *inode, struct file *filp)
417{
418 struct ftrace_event_file *file = inode->i_private;
419 struct trace_array *tr = file->tr;
420 int ret;
421
422 if (trace_array_get(tr) < 0)
423 return -ENODEV;
424
425 ret = tracing_open_generic(inode, filp);
426 if (ret < 0)
427 trace_array_put(tr);
428 return ret;
429}
430
431static int tracing_release_generic_file(struct inode *inode, struct file *filp)
432{
433 struct ftrace_event_file *file = inode->i_private;
434 struct trace_array *tr = file->tr;
435
436 trace_array_put(tr);
437
438 return 0;
439}
440
441/*
397 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 442 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
398 */ 443 */
399static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, 444static int
400 const char *sub, const char *event, int set) 445__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
446 const char *sub, const char *event, int set)
401{ 447{
402 struct ftrace_event_file *file; 448 struct ftrace_event_file *file;
403 struct ftrace_event_call *call; 449 struct ftrace_event_call *call;
404 int ret = -EINVAL; 450 int ret = -EINVAL;
405 451
406 mutex_lock(&event_mutex);
407 list_for_each_entry(file, &tr->events, list) { 452 list_for_each_entry(file, &tr->events, list) {
408 453
409 call = file->event_call; 454 call = file->event_call;
@@ -429,6 +474,17 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
429 474
430 ret = 0; 475 ret = 0;
431 } 476 }
477
478 return ret;
479}
480
481static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
482 const char *sub, const char *event, int set)
483{
484 int ret;
485
486 mutex_lock(&event_mutex);
487 ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set);
432 mutex_unlock(&event_mutex); 488 mutex_unlock(&event_mutex);
433 489
434 return ret; 490 return ret;
@@ -624,17 +680,17 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
624 loff_t *ppos) 680 loff_t *ppos)
625{ 681{
626 struct ftrace_event_file *file = filp->private_data; 682 struct ftrace_event_file *file = filp->private_data;
627 char *buf; 683 char buf[4] = "0";
628 684
629 if (file->flags & FTRACE_EVENT_FL_ENABLED) { 685 if (file->flags & FTRACE_EVENT_FL_ENABLED &&
630 if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) 686 !(file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
631 buf = "0*\n"; 687 strcpy(buf, "1");
632 else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) 688
633 buf = "1*\n"; 689 if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
634 else 690 file->flags & FTRACE_EVENT_FL_SOFT_MODE)
635 buf = "1\n"; 691 strcat(buf, "*");
636 } else 692
637 buf = "0\n"; 693 strcat(buf, "\n");
638 694
639 return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); 695 return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));
640} 696}
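The rewritten event_enable_read() now composes the status string instead of choosing between fixed literals: "1" only when the event is enabled and not soft-disabled, otherwise "0", with a trailing "*" whenever soft mode or the soft-disable bit is involved. A small user-space table of the combinations (the FL_* bit values here are illustrative, not the kernel's actual flag layout):

    #include <stdio.h>
    #include <string.h>

    #define FL_ENABLED       (1 << 0)
    #define FL_SOFT_MODE     (1 << 1)
    #define FL_SOFT_DISABLED (1 << 2)

    static void format_enable(unsigned long flags, char *buf)
    {
        strcpy(buf, "0");
        if ((flags & FL_ENABLED) && !(flags & FL_SOFT_DISABLED))
            strcpy(buf, "1");
        if (flags & (FL_SOFT_DISABLED | FL_SOFT_MODE))
            strcat(buf, "*");
        strcat(buf, "\n");
    }

    int main(void)
    {
        unsigned long cases[] = {
            0,
            FL_ENABLED,
            FL_ENABLED | FL_SOFT_MODE,
            FL_ENABLED | FL_SOFT_MODE | FL_SOFT_DISABLED,
        };
        char buf[8];
        unsigned int i;

        for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
            format_enable(cases[i], buf);
            printf("flags=%lx -> %s", cases[i], buf);
        }
        return 0;
    }

The four cases print "0", "1", "1*" and "0*", matching the states a user sees in the enable file.
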
@@ -992,6 +1048,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
992 int ret; 1048 int ret;
993 1049
994 /* Make sure the system still exists */ 1050 /* Make sure the system still exists */
1051 mutex_lock(&trace_types_lock);
995 mutex_lock(&event_mutex); 1052 mutex_lock(&event_mutex);
996 list_for_each_entry(tr, &ftrace_trace_arrays, list) { 1053 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
997 list_for_each_entry(dir, &tr->systems, list) { 1054 list_for_each_entry(dir, &tr->systems, list) {
@@ -1007,6 +1064,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
1007 } 1064 }
1008 exit_loop: 1065 exit_loop:
1009 mutex_unlock(&event_mutex); 1066 mutex_unlock(&event_mutex);
1067 mutex_unlock(&trace_types_lock);
1010 1068
1011 if (!system) 1069 if (!system)
1012 return -ENODEV; 1070 return -ENODEV;
@@ -1014,9 +1072,17 @@ static int subsystem_open(struct inode *inode, struct file *filp)
1014 /* Some versions of gcc think dir can be uninitialized here */ 1072 /* Some versions of gcc think dir can be uninitialized here */
1015 WARN_ON(!dir); 1073 WARN_ON(!dir);
1016 1074
1075 /* Still need to increment the ref count of the system */
1076 if (trace_array_get(tr) < 0) {
1077 put_system(dir);
1078 return -ENODEV;
1079 }
1080
1017 ret = tracing_open_generic(inode, filp); 1081 ret = tracing_open_generic(inode, filp);
1018 if (ret < 0) 1082 if (ret < 0) {
1083 trace_array_put(tr);
1019 put_system(dir); 1084 put_system(dir);
1085 }
1020 1086
1021 return ret; 1087 return ret;
1022} 1088}
@@ -1027,16 +1093,23 @@ static int system_tr_open(struct inode *inode, struct file *filp)
1027 struct trace_array *tr = inode->i_private; 1093 struct trace_array *tr = inode->i_private;
1028 int ret; 1094 int ret;
1029 1095
1096 if (trace_array_get(tr) < 0)
1097 return -ENODEV;
1098
1030 /* Make a temporary dir that has no system but points to tr */ 1099 /* Make a temporary dir that has no system but points to tr */
1031 dir = kzalloc(sizeof(*dir), GFP_KERNEL); 1100 dir = kzalloc(sizeof(*dir), GFP_KERNEL);
1032 if (!dir) 1101 if (!dir) {
1102 trace_array_put(tr);
1033 return -ENOMEM; 1103 return -ENOMEM;
1104 }
1034 1105
1035 dir->tr = tr; 1106 dir->tr = tr;
1036 1107
1037 ret = tracing_open_generic(inode, filp); 1108 ret = tracing_open_generic(inode, filp);
1038 if (ret < 0) 1109 if (ret < 0) {
1110 trace_array_put(tr);
1039 kfree(dir); 1111 kfree(dir);
1112 }
1040 1113
1041 filp->private_data = dir; 1114 filp->private_data = dir;
1042 1115
@@ -1047,6 +1120,8 @@ static int subsystem_release(struct inode *inode, struct file *file)
1047{ 1120{
1048 struct ftrace_subsystem_dir *dir = file->private_data; 1121 struct ftrace_subsystem_dir *dir = file->private_data;
1049 1122
1123 trace_array_put(dir->tr);
1124
1050 /* 1125 /*
1051 * If dir->subsystem is NULL, then this is a temporary 1126 * If dir->subsystem is NULL, then this is a temporary
1052 * descriptor that was made for a trace_array to enable 1127 * descriptor that was made for a trace_array to enable
@@ -1174,9 +1249,10 @@ static const struct file_operations ftrace_set_event_fops = {
1174}; 1249};
1175 1250
1176static const struct file_operations ftrace_enable_fops = { 1251static const struct file_operations ftrace_enable_fops = {
1177 .open = tracing_open_generic, 1252 .open = tracing_open_generic_file,
1178 .read = event_enable_read, 1253 .read = event_enable_read,
1179 .write = event_enable_write, 1254 .write = event_enable_write,
1255 .release = tracing_release_generic_file,
1180 .llseek = default_llseek, 1256 .llseek = default_llseek,
1181}; 1257};
1182 1258
@@ -1279,7 +1355,15 @@ create_new_subsystem(const char *name)
1279 return NULL; 1355 return NULL;
1280 1356
1281 system->ref_count = 1; 1357 system->ref_count = 1;
1282 system->name = name; 1358
1359 /* Only allocate if dynamic (kprobes and modules) */
1360 if (!core_kernel_data((unsigned long)name)) {
1361 system->ref_count |= SYSTEM_FL_FREE_NAME;
1362 system->name = kstrdup(name, GFP_KERNEL);
1363 if (!system->name)
1364 goto out_free;
1365 } else
1366 system->name = name;
1283 1367
1284 system->filter = NULL; 1368 system->filter = NULL;
1285 1369
@@ -1292,6 +1376,8 @@ create_new_subsystem(const char *name)
1292 return system; 1376 return system;
1293 1377
1294 out_free: 1378 out_free:
1379 if (system->ref_count & SYSTEM_FL_FREE_NAME)
1380 kfree(system->name);
1295 kfree(system); 1381 kfree(system);
1296 return NULL; 1382 return NULL;
1297} 1383}
@@ -1591,6 +1677,7 @@ static void __add_event_to_tracers(struct ftrace_event_call *call,
1591int trace_add_event_call(struct ftrace_event_call *call) 1677int trace_add_event_call(struct ftrace_event_call *call)
1592{ 1678{
1593 int ret; 1679 int ret;
1680 mutex_lock(&trace_types_lock);
1594 mutex_lock(&event_mutex); 1681 mutex_lock(&event_mutex);
1595 1682
1596 ret = __register_event(call, NULL); 1683 ret = __register_event(call, NULL);
@@ -1598,11 +1685,13 @@ int trace_add_event_call(struct ftrace_event_call *call)
1598 __add_event_to_tracers(call, NULL); 1685 __add_event_to_tracers(call, NULL);
1599 1686
1600 mutex_unlock(&event_mutex); 1687 mutex_unlock(&event_mutex);
1688 mutex_unlock(&trace_types_lock);
1601 return ret; 1689 return ret;
1602} 1690}
1603 1691
1604/* 1692/*
1605 * Must be called under locking both of event_mutex and trace_event_sem. 1693 * Must be called under locking of trace_types_lock, event_mutex and
1694 * trace_event_sem.
1606 */ 1695 */
1607static void __trace_remove_event_call(struct ftrace_event_call *call) 1696static void __trace_remove_event_call(struct ftrace_event_call *call)
1608{ 1697{
@@ -1614,11 +1703,13 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
1614/* Remove an event_call */ 1703/* Remove an event_call */
1615void trace_remove_event_call(struct ftrace_event_call *call) 1704void trace_remove_event_call(struct ftrace_event_call *call)
1616{ 1705{
1706 mutex_lock(&trace_types_lock);
1617 mutex_lock(&event_mutex); 1707 mutex_lock(&event_mutex);
1618 down_write(&trace_event_sem); 1708 down_write(&trace_event_sem);
1619 __trace_remove_event_call(call); 1709 __trace_remove_event_call(call);
1620 up_write(&trace_event_sem); 1710 up_write(&trace_event_sem);
1621 mutex_unlock(&event_mutex); 1711 mutex_unlock(&event_mutex);
1712 mutex_unlock(&trace_types_lock);
1622} 1713}
1623 1714
1624#define for_each_event(event, start, end) \ 1715#define for_each_event(event, start, end) \
@@ -1762,6 +1853,7 @@ static int trace_module_notify(struct notifier_block *self,
1762{ 1853{
1763 struct module *mod = data; 1854 struct module *mod = data;
1764 1855
1856 mutex_lock(&trace_types_lock);
1765 mutex_lock(&event_mutex); 1857 mutex_lock(&event_mutex);
1766 switch (val) { 1858 switch (val) {
1767 case MODULE_STATE_COMING: 1859 case MODULE_STATE_COMING:
@@ -1772,6 +1864,7 @@ static int trace_module_notify(struct notifier_block *self,
1772 break; 1864 break;
1773 } 1865 }
1774 mutex_unlock(&event_mutex); 1866 mutex_unlock(&event_mutex);
1867 mutex_unlock(&trace_types_lock);
1775 1868
1776 return 0; 1869 return 0;
1777} 1870}
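All of the new call sites nest the two mutexes in the same order, trace_types_lock first with event_mutex inside it (trace_add_event_call(), trace_remove_event_call(), subsystem_open() and the module notifier above), which is what keeps the pair deadlock-free. A user-space sketch of sticking to one global acquisition order (pthread mutexes stand in for the kernel ones):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t outer_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ trace_types_lock */
    static pthread_mutex_t inner_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ event_mutex */

    /* Every path nests the locks in the same order: outer, then inner.
     * Mixing the order in different paths is what invites ABBA deadlock. */
    static void add_call(void)
    {
        pthread_mutex_lock(&outer_lock);
        pthread_mutex_lock(&inner_lock);
        printf("register event call\n");
        pthread_mutex_unlock(&inner_lock);
        pthread_mutex_unlock(&outer_lock);
    }

    static void remove_call(void)
    {
        pthread_mutex_lock(&outer_lock);
        pthread_mutex_lock(&inner_lock);
        printf("remove event call\n");
        pthread_mutex_unlock(&inner_lock);
        pthread_mutex_unlock(&outer_lock);
    }

    int main(void)
    {
        add_call();
        remove_call();
        return 0;
    }
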
@@ -2011,10 +2104,7 @@ event_enable_func(struct ftrace_hash *hash,
2011 int ret; 2104 int ret;
2012 2105
2013 /* hash funcs only work with set_ftrace_filter */ 2106 /* hash funcs only work with set_ftrace_filter */
2014 if (!enabled) 2107 if (!enabled || !param)
2015 return -EINVAL;
2016
2017 if (!param)
2018 return -EINVAL; 2108 return -EINVAL;
2019 2109
2020 system = strsep(&param, ":"); 2110 system = strsep(&param, ":");
@@ -2329,11 +2419,11 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
2329 2419
2330int event_trace_del_tracer(struct trace_array *tr) 2420int event_trace_del_tracer(struct trace_array *tr)
2331{ 2421{
2332 /* Disable any running events */
2333 __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
2334
2335 mutex_lock(&event_mutex); 2422 mutex_lock(&event_mutex);
2336 2423
2424 /* Disable any running events */
2425 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
2426
2337 down_write(&trace_event_sem); 2427 down_write(&trace_event_sem);
2338 __trace_remove_event_dirs(tr); 2428 __trace_remove_event_dirs(tr);
2339 debugfs_remove_recursive(tr->event_dir); 2429 debugfs_remove_recursive(tr->event_dir);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e1b653f7e1ca..0d883dc057d6 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -44,6 +44,7 @@ enum filter_op_ids
44 OP_LE, 44 OP_LE,
45 OP_GT, 45 OP_GT,
46 OP_GE, 46 OP_GE,
47 OP_BAND,
47 OP_NONE, 48 OP_NONE,
48 OP_OPEN_PAREN, 49 OP_OPEN_PAREN,
49}; 50};
@@ -54,6 +55,7 @@ struct filter_op {
54 int precedence; 55 int precedence;
55}; 56};
56 57
58/* Order must be the same as enum filter_op_ids above */
57static struct filter_op filter_ops[] = { 59static struct filter_op filter_ops[] = {
58 { OP_OR, "||", 1 }, 60 { OP_OR, "||", 1 },
59 { OP_AND, "&&", 2 }, 61 { OP_AND, "&&", 2 },
@@ -64,6 +66,7 @@ static struct filter_op filter_ops[] = {
64 { OP_LE, "<=", 5 }, 66 { OP_LE, "<=", 5 },
65 { OP_GT, ">", 5 }, 67 { OP_GT, ">", 5 },
66 { OP_GE, ">=", 5 }, 68 { OP_GE, ">=", 5 },
69 { OP_BAND, "&", 6 },
67 { OP_NONE, "OP_NONE", 0 }, 70 { OP_NONE, "OP_NONE", 0 },
68 { OP_OPEN_PAREN, "(", 0 }, 71 { OP_OPEN_PAREN, "(", 0 },
69}; 72};
@@ -156,6 +159,9 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
156 case OP_GE: \ 159 case OP_GE: \
157 match = (*addr >= val); \ 160 match = (*addr >= val); \
158 break; \ 161 break; \
162 case OP_BAND: \
163 match = (*addr & val); \
164 break; \
159 default: \ 165 default: \
160 break; \ 166 break; \
161 } \ 167 } \
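OP_BAND adds a bitwise-AND predicate to the event filter language: a filter such as "flags & 2" matches any event whose field has that bit set, since a non-zero AND result counts as a match, and the operator is registered with precedence 6, one step above the relational operators. A minimal user-space sketch of the predicate:

    #include <stdio.h>

    /* Mirrors the generated filter_pred_*() case for OP_BAND: a match is
     * simply "field value AND constant is non-zero". */
    static int pred_band(unsigned long long field, unsigned long long val)
    {
        return (field & val) != 0;
    }

    int main(void)
    {
        /* e.g. the filter string "flags & 2" against a few event values */
        unsigned long long flags[] = { 0x0, 0x2, 0x3, 0x4 };
        unsigned int i;

        for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++)
            printf("flags=0x%llx -> %s\n", flags[i],
                   pred_band(flags[i], 0x2) ? "match" : "no match");
        return 0;
    }
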
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c4d6d7191988..b863f93b30f3 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -290,6 +290,21 @@ ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
290 trace_dump_stack(STACK_SKIP); 290 trace_dump_stack(STACK_SKIP);
291} 291}
292 292
293static void
294ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data)
295{
296 if (update_count(data))
297 ftrace_dump(DUMP_ALL);
298}
299
300/* Only dump the current CPU buffer. */
301static void
302ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data)
303{
304 if (update_count(data))
305 ftrace_dump(DUMP_ORIG);
306}
307
293static int 308static int
294ftrace_probe_print(const char *name, struct seq_file *m, 309ftrace_probe_print(const char *name, struct seq_file *m,
295 unsigned long ip, void *data) 310 unsigned long ip, void *data)
@@ -327,6 +342,20 @@ ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
327 return ftrace_probe_print("stacktrace", m, ip, data); 342 return ftrace_probe_print("stacktrace", m, ip, data);
328} 343}
329 344
345static int
346ftrace_dump_print(struct seq_file *m, unsigned long ip,
347 struct ftrace_probe_ops *ops, void *data)
348{
349 return ftrace_probe_print("dump", m, ip, data);
350}
351
352static int
353ftrace_cpudump_print(struct seq_file *m, unsigned long ip,
354 struct ftrace_probe_ops *ops, void *data)
355{
356 return ftrace_probe_print("cpudump", m, ip, data);
357}
358
330static struct ftrace_probe_ops traceon_count_probe_ops = { 359static struct ftrace_probe_ops traceon_count_probe_ops = {
331 .func = ftrace_traceon_count, 360 .func = ftrace_traceon_count,
332 .print = ftrace_traceon_print, 361 .print = ftrace_traceon_print,
@@ -342,6 +371,16 @@ static struct ftrace_probe_ops stacktrace_count_probe_ops = {
342 .print = ftrace_stacktrace_print, 371 .print = ftrace_stacktrace_print,
343}; 372};
344 373
374static struct ftrace_probe_ops dump_probe_ops = {
375 .func = ftrace_dump_probe,
376 .print = ftrace_dump_print,
377};
378
379static struct ftrace_probe_ops cpudump_probe_ops = {
380 .func = ftrace_cpudump_probe,
381 .print = ftrace_cpudump_print,
382};
383
345static struct ftrace_probe_ops traceon_probe_ops = { 384static struct ftrace_probe_ops traceon_probe_ops = {
346 .func = ftrace_traceon, 385 .func = ftrace_traceon,
347 .print = ftrace_traceon_print, 386 .print = ftrace_traceon_print,
@@ -425,6 +464,32 @@ ftrace_stacktrace_callback(struct ftrace_hash *hash,
425 param, enable); 464 param, enable);
426} 465}
427 466
467static int
468ftrace_dump_callback(struct ftrace_hash *hash,
469 char *glob, char *cmd, char *param, int enable)
470{
471 struct ftrace_probe_ops *ops;
472
473 ops = &dump_probe_ops;
474
475 /* Only dump once. */
476 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
477 "1", enable);
478}
479
480static int
481ftrace_cpudump_callback(struct ftrace_hash *hash,
482 char *glob, char *cmd, char *param, int enable)
483{
484 struct ftrace_probe_ops *ops;
485
486 ops = &cpudump_probe_ops;
487
488 /* Only dump once. */
489 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
490 "1", enable);
491}
492
428static struct ftrace_func_command ftrace_traceon_cmd = { 493static struct ftrace_func_command ftrace_traceon_cmd = {
429 .name = "traceon", 494 .name = "traceon",
430 .func = ftrace_trace_onoff_callback, 495 .func = ftrace_trace_onoff_callback,
@@ -440,6 +505,16 @@ static struct ftrace_func_command ftrace_stacktrace_cmd = {
440 .func = ftrace_stacktrace_callback, 505 .func = ftrace_stacktrace_callback,
441}; 506};
442 507
508static struct ftrace_func_command ftrace_dump_cmd = {
509 .name = "dump",
510 .func = ftrace_dump_callback,
511};
512
513static struct ftrace_func_command ftrace_cpudump_cmd = {
514 .name = "cpudump",
515 .func = ftrace_cpudump_callback,
516};
517
443static int __init init_func_cmd_traceon(void) 518static int __init init_func_cmd_traceon(void)
444{ 519{
445 int ret; 520 int ret;
@@ -450,13 +525,31 @@ static int __init init_func_cmd_traceon(void)
450 525
451 ret = register_ftrace_command(&ftrace_traceon_cmd); 526 ret = register_ftrace_command(&ftrace_traceon_cmd);
452 if (ret) 527 if (ret)
453 unregister_ftrace_command(&ftrace_traceoff_cmd); 528 goto out_free_traceoff;
454 529
455 ret = register_ftrace_command(&ftrace_stacktrace_cmd); 530 ret = register_ftrace_command(&ftrace_stacktrace_cmd);
456 if (ret) { 531 if (ret)
457 unregister_ftrace_command(&ftrace_traceoff_cmd); 532 goto out_free_traceon;
458 unregister_ftrace_command(&ftrace_traceon_cmd); 533
459 } 534 ret = register_ftrace_command(&ftrace_dump_cmd);
535 if (ret)
536 goto out_free_stacktrace;
537
538 ret = register_ftrace_command(&ftrace_cpudump_cmd);
539 if (ret)
540 goto out_free_dump;
541
542 return 0;
543
544 out_free_dump:
545 unregister_ftrace_command(&ftrace_dump_cmd);
546 out_free_stacktrace:
547 unregister_ftrace_command(&ftrace_stacktrace_cmd);
548 out_free_traceon:
549 unregister_ftrace_command(&ftrace_traceon_cmd);
550 out_free_traceoff:
551 unregister_ftrace_command(&ftrace_traceoff_cmd);
552
460 return ret; 553 return ret;
461} 554}
462#else 555#else
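init_func_cmd_traceon() now unwinds with cascading goto labels: each successfully registered command gets a label, and a failure jumps to the label that unregisters everything registered so far, in reverse order. A generic user-space sketch of the idiom (acquire()/release() are placeholders for the register/unregister calls):

    #include <stdio.h>

    static int acquire(const char *what, int fail)
    {
        if (fail) {
            printf("acquire %s: failed\n", what);
            return -1;
        }
        printf("acquire %s\n", what);
        return 0;
    }

    static void release(const char *what)
    {
        printf("release %s\n", what);
    }

    /* Cascading-unwind idiom: on failure, fall through the labels that undo
     * every step that already succeeded, in reverse order. */
    static int setup(int fail_at)
    {
        int ret;

        ret = acquire("A", fail_at == 1);
        if (ret)
            return ret;
        ret = acquire("B", fail_at == 2);
        if (ret)
            goto out_release_a;
        ret = acquire("C", fail_at == 3);
        if (ret)
            goto out_release_b;

        return 0;

     out_release_b:
        release("B");
     out_release_a:
        release("A");
        return ret;
    }

    int main(void)
    {
        printf("-> setup(fail_at=3): %d\n", setup(3));
        return 0;
    }
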
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b19d065a28cb..2aefbee93a6d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -373,7 +373,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
373 struct trace_array_cpu *data; 373 struct trace_array_cpu *data;
374 unsigned long flags; 374 unsigned long flags;
375 375
376 if (likely(!tracer_enabled)) 376 if (!tracer_enabled || !tracing_is_enabled())
377 return; 377 return;
378 378
379 cpu = raw_smp_processor_id(); 379 cpu = raw_smp_processor_id();
@@ -416,7 +416,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
416 else 416 else
417 return; 417 return;
418 418
419 if (!tracer_enabled) 419 if (!tracer_enabled || !tracing_is_enabled())
420 return; 420 return;
421 421
422 data = per_cpu_ptr(tr->trace_buffer.data, cpu); 422 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
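The trace_irqsoff.c change above tightens the hot-path guard: the latency timestamps are taken only when the irqsoff tracer itself is enabled and global tracing has not been switched off via tracing_off(). A tiny illustrative sketch, with stand-in flags in place of tracer_enabled and tracing_is_enabled():

/* Sketch of the combined gate: the per-tracer flag alone is not enough,
 * the global tracing switch must also be on. Flags are stand-ins. */
#include <stdbool.h>
#include <stdio.h>

static bool tracer_enabled = true;	/* per-tracer state */
static bool tracing_enabled = false;	/* global tracing_on()/tracing_off() state */

static void start_critical_timing(void)
{
	if (!tracer_enabled || !tracing_enabled)
		return;			/* bail out before touching per-cpu data */
	puts("timestamp critical section");
}

int main(void)
{
	start_critical_timing();	/* no output: global switch is off */
	tracing_enabled = true;
	start_critical_timing();	/* now records */
	return 0;
}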
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9f46e98ba8f2..7ed6976493c8 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -35,12 +35,17 @@ struct trace_probe {
35 const char *symbol; /* symbol name */ 35 const char *symbol; /* symbol name */
36 struct ftrace_event_class class; 36 struct ftrace_event_class class;
37 struct ftrace_event_call call; 37 struct ftrace_event_call call;
38 struct ftrace_event_file * __rcu *files; 38 struct list_head files;
39 ssize_t size; /* trace entry size */ 39 ssize_t size; /* trace entry size */
40 unsigned int nr_args; 40 unsigned int nr_args;
41 struct probe_arg args[]; 41 struct probe_arg args[];
42}; 42};
43 43
44struct event_file_link {
45 struct ftrace_event_file *file;
46 struct list_head list;
47};
48
44#define SIZEOF_TRACE_PROBE(n) \ 49#define SIZEOF_TRACE_PROBE(n) \
45 (offsetof(struct trace_probe, args) + \ 50 (offsetof(struct trace_probe, args) + \
46 (sizeof(struct probe_arg) * (n))) 51 (sizeof(struct probe_arg) * (n)))
@@ -150,6 +155,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
150 goto error; 155 goto error;
151 156
152 INIT_LIST_HEAD(&tp->list); 157 INIT_LIST_HEAD(&tp->list);
158 INIT_LIST_HEAD(&tp->files);
153 return tp; 159 return tp;
154error: 160error:
155 kfree(tp->call.name); 161 kfree(tp->call.name);
@@ -183,25 +189,6 @@ static struct trace_probe *find_trace_probe(const char *event,
183 return NULL; 189 return NULL;
184} 190}
185 191
186static int trace_probe_nr_files(struct trace_probe *tp)
187{
188 struct ftrace_event_file **file;
189 int ret = 0;
190
191 /*
192 * Since all tp->files updater is protected by probe_enable_lock,
193 * we don't need to lock an rcu_read_lock.
194 */
195 file = rcu_dereference_raw(tp->files);
196 if (file)
197 while (*(file++))
198 ret++;
199
200 return ret;
201}
202
203static DEFINE_MUTEX(probe_enable_lock);
204
205/* 192/*
206 * Enable trace_probe 193 * Enable trace_probe
207 * if the file is NULL, enable "perf" handler, or enable "trace" handler. 194 * if the file is NULL, enable "perf" handler, or enable "trace" handler.
@@ -211,67 +198,42 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
211{ 198{
212 int ret = 0; 199 int ret = 0;
213 200
214 mutex_lock(&probe_enable_lock);
215
216 if (file) { 201 if (file) {
217 struct ftrace_event_file **new, **old; 202 struct event_file_link *link;
218 int n = trace_probe_nr_files(tp); 203
219 204 link = kmalloc(sizeof(*link), GFP_KERNEL);
220 old = rcu_dereference_raw(tp->files); 205 if (!link) {
221 /* 1 is for new one and 1 is for stopper */
222 new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *),
223 GFP_KERNEL);
224 if (!new) {
225 ret = -ENOMEM; 206 ret = -ENOMEM;
226 goto out_unlock; 207 goto out;
227 } 208 }
228 memcpy(new, old, n * sizeof(struct ftrace_event_file *));
229 new[n] = file;
230 /* The last one keeps a NULL */
231 209
232 rcu_assign_pointer(tp->files, new); 210 link->file = file;
233 tp->flags |= TP_FLAG_TRACE; 211 list_add_tail_rcu(&link->list, &tp->files);
234 212
235 if (old) { 213 tp->flags |= TP_FLAG_TRACE;
236 /* Make sure the probe is done with old files */
237 synchronize_sched();
238 kfree(old);
239 }
240 } else 214 } else
241 tp->flags |= TP_FLAG_PROFILE; 215 tp->flags |= TP_FLAG_PROFILE;
242 216
243 if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && 217 if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) {
244 !trace_probe_has_gone(tp)) {
245 if (trace_probe_is_return(tp)) 218 if (trace_probe_is_return(tp))
246 ret = enable_kretprobe(&tp->rp); 219 ret = enable_kretprobe(&tp->rp);
247 else 220 else
248 ret = enable_kprobe(&tp->rp.kp); 221 ret = enable_kprobe(&tp->rp.kp);
249 } 222 }
250 223 out:
251 out_unlock:
252 mutex_unlock(&probe_enable_lock);
253
254 return ret; 224 return ret;
255} 225}
256 226
257static int 227static struct event_file_link *
258trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) 228find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
259{ 229{
260 struct ftrace_event_file **files; 230 struct event_file_link *link;
261 int i;
262 231
263 /* 232 list_for_each_entry(link, &tp->files, list)
264 * Since all tp->files updater is protected by probe_enable_lock, 233 if (link->file == file)
265 * we don't need to lock an rcu_read_lock. 234 return link;
266 */
267 files = rcu_dereference_raw(tp->files);
268 if (files) {
269 for (i = 0; files[i]; i++)
270 if (files[i] == file)
271 return i;
272 }
273 235
274 return -1; 236 return NULL;
275} 237}
276 238
277/* 239/*
@@ -283,41 +245,24 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
283{ 245{
284 int ret = 0; 246 int ret = 0;
285 247
286 mutex_lock(&probe_enable_lock);
287
288 if (file) { 248 if (file) {
289 struct ftrace_event_file **new, **old; 249 struct event_file_link *link;
290 int n = trace_probe_nr_files(tp);
291 int i, j;
292 250
293 old = rcu_dereference_raw(tp->files); 251 link = find_event_file_link(tp, file);
294 if (n == 0 || trace_probe_file_index(tp, file) < 0) { 252 if (!link) {
295 ret = -EINVAL; 253 ret = -EINVAL;
296 goto out_unlock; 254 goto out;
297 } 255 }
298 256
299 if (n == 1) { /* Remove the last file */ 257 list_del_rcu(&link->list);
300 tp->flags &= ~TP_FLAG_TRACE; 258 /* synchronize with kprobe_trace_func/kretprobe_trace_func */
301 new = NULL; 259 synchronize_sched();
302 } else { 260 kfree(link);
303 new = kzalloc(n * sizeof(struct ftrace_event_file *),
304 GFP_KERNEL);
305 if (!new) {
306 ret = -ENOMEM;
307 goto out_unlock;
308 }
309
310 /* This copy & check loop copies the NULL stopper too */
311 for (i = 0, j = 0; j < n && i < n + 1; i++)
312 if (old[i] != file)
313 new[j++] = old[i];
314 }
315 261
316 rcu_assign_pointer(tp->files, new); 262 if (!list_empty(&tp->files))
263 goto out;
317 264
318 /* Make sure the probe is done with old files */ 265 tp->flags &= ~TP_FLAG_TRACE;
319 synchronize_sched();
320 kfree(old);
321 } else 266 } else
322 tp->flags &= ~TP_FLAG_PROFILE; 267 tp->flags &= ~TP_FLAG_PROFILE;
323 268
@@ -327,10 +272,7 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
327 else 272 else
328 disable_kprobe(&tp->rp.kp); 273 disable_kprobe(&tp->rp.kp);
329 } 274 }
330 275 out:
331 out_unlock:
332 mutex_unlock(&probe_enable_lock);
333
334 return ret; 276 return ret;
335} 277}
336 278
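The enable/disable hunks above replace the hand-managed, NULL-terminated array behind tp->files with a plain list of struct event_file_link nodes: enable adds a node with list_add_tail_rcu(), disable unlinks it with list_del_rcu(), waits out the preempt-disabled probe handlers with synchronize_sched(), and only then frees it. The separate probe_enable_lock goes away because the writers are already serialized (see the event_mutex comment added further down in this file). A condensed sketch of that pattern, using the same list/RCU primitives but with illustrative names rather than the exact kernel symbols:

/*
 * Condensed sketch of the new tp->files handling.  Struct and function
 * names are illustrative; the primitives are the ones used in the patch.
 */
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>

struct file_link {
	void			*file;	/* stands in for struct ftrace_event_file * */
	struct list_head	list;
};

static LIST_HEAD(files);		/* stands in for tp->files */

/* writer side: serialized by event_mutex in the real code, no extra lock */
static int attach_file(void *file)
{
	struct file_link *link = kmalloc(sizeof(*link), GFP_KERNEL);

	if (!link)
		return -ENOMEM;
	link->file = file;
	list_add_tail_rcu(&link->list, &files);
	return 0;
}

static void detach_file(void *file)
{
	struct file_link *link;

	list_for_each_entry(link, &files, list) {
		if (link->file != file)
			continue;
		list_del_rcu(&link->list);
		synchronize_sched();	/* probe handlers run with preemption off */
		kfree(link);
		return;
	}
}

/* reader side: probe handlers walk the list with preemption disabled (sched-RCU) */
static void for_each_file(void (*fn)(void *file))
{
	struct file_link *link;

	list_for_each_entry_rcu(link, &files, list)
		fn(link->file);
}

The reader side corresponds to kprobe_trace_func()/kretprobe_trace_func() below, which now simply walk the list under list_for_each_entry_rcu() instead of chasing a reallocated pointer array.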
@@ -885,20 +827,10 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
885static __kprobes void 827static __kprobes void
886kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) 828kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
887{ 829{
888 /* 830 struct event_file_link *link;
889 * Note: preempt is already disabled around the kprobe handler.
890 * However, we still need an smp_read_barrier_depends() corresponding
891 * to smp_wmb() in rcu_assign_pointer() to access the pointer.
892 */
893 struct ftrace_event_file **file = rcu_dereference_raw(tp->files);
894
895 if (unlikely(!file))
896 return;
897 831
898 while (*file) { 832 list_for_each_entry_rcu(link, &tp->files, list)
899 __kprobe_trace_func(tp, regs, *file); 833 __kprobe_trace_func(tp, regs, link->file);
900 file++;
901 }
902} 834}
903 835
904/* Kretprobe handler */ 836/* Kretprobe handler */
@@ -945,20 +877,10 @@ static __kprobes void
945kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, 877kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
946 struct pt_regs *regs) 878 struct pt_regs *regs)
947{ 879{
948 /* 880 struct event_file_link *link;
949 * Note: preempt is already disabled around the kprobe handler.
950 * However, we still need an smp_read_barrier_depends() corresponding
951 * to smp_wmb() in rcu_assign_pointer() to access the pointer.
952 */
953 struct ftrace_event_file **file = rcu_dereference_raw(tp->files);
954 881
955 if (unlikely(!file)) 882 list_for_each_entry_rcu(link, &tp->files, list)
956 return; 883 __kretprobe_trace_func(tp, ri, regs, link->file);
957
958 while (*file) {
959 __kretprobe_trace_func(tp, ri, regs, *file);
960 file++;
961 }
962} 884}
963 885
964/* Event entry printers */ 886/* Event entry printers */
@@ -1157,6 +1079,10 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
1157 int size, __size, dsize; 1079 int size, __size, dsize;
1158 int rctx; 1080 int rctx;
1159 1081
1082 head = this_cpu_ptr(call->perf_events);
1083 if (hlist_empty(head))
1084 return;
1085
1160 dsize = __get_data_size(tp, regs); 1086 dsize = __get_data_size(tp, regs);
1161 __size = sizeof(*entry) + tp->size + dsize; 1087 __size = sizeof(*entry) + tp->size + dsize;
1162 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1088 size = ALIGN(__size + sizeof(u32), sizeof(u64));
@@ -1172,10 +1098,7 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
1172 entry->ip = (unsigned long)tp->rp.kp.addr; 1098 entry->ip = (unsigned long)tp->rp.kp.addr;
1173 memset(&entry[1], 0, dsize); 1099 memset(&entry[1], 0, dsize);
1174 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1100 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1175 1101 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1176 head = this_cpu_ptr(call->perf_events);
1177 perf_trace_buf_submit(entry, size, rctx,
1178 entry->ip, 1, regs, head, NULL);
1179} 1102}
1180 1103
1181/* Kretprobe profile handler */ 1104/* Kretprobe profile handler */
@@ -1189,6 +1112,10 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1189 int size, __size, dsize; 1112 int size, __size, dsize;
1190 int rctx; 1113 int rctx;
1191 1114
1115 head = this_cpu_ptr(call->perf_events);
1116 if (hlist_empty(head))
1117 return;
1118
1192 dsize = __get_data_size(tp, regs); 1119 dsize = __get_data_size(tp, regs);
1193 __size = sizeof(*entry) + tp->size + dsize; 1120 __size = sizeof(*entry) + tp->size + dsize;
1194 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1121 size = ALIGN(__size + sizeof(u32), sizeof(u64));
@@ -1204,13 +1131,16 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1204 entry->func = (unsigned long)tp->rp.kp.addr; 1131 entry->func = (unsigned long)tp->rp.kp.addr;
1205 entry->ret_ip = (unsigned long)ri->ret_addr; 1132 entry->ret_ip = (unsigned long)ri->ret_addr;
1206 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1133 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1207 1134 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1208 head = this_cpu_ptr(call->perf_events);
1209 perf_trace_buf_submit(entry, size, rctx,
1210 entry->ret_ip, 1, regs, head, NULL);
1211} 1135}
1212#endif /* CONFIG_PERF_EVENTS */ 1136#endif /* CONFIG_PERF_EVENTS */
1213 1137
1138/*
1139 * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex.
1140 *
1141 * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
1142 * lockless, but we can't race with this __init function.
1143 */
1214static __kprobes 1144static __kprobes
1215int kprobe_register(struct ftrace_event_call *event, 1145int kprobe_register(struct ftrace_event_call *event,
1216 enum trace_reg type, void *data) 1146 enum trace_reg type, void *data)
@@ -1376,6 +1306,10 @@ find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr)
1376 return NULL; 1306 return NULL;
1377} 1307}
1378 1308
1309/*
1310 * Nobody but us can call enable_trace_probe/disable_trace_probe at this
1311 * stage, we can do this lockless.
1312 */
1379static __init int kprobe_trace_self_tests_init(void) 1313static __init int kprobe_trace_self_tests_init(void)
1380{ 1314{
1381 int ret, warn = 0; 1315 int ret, warn = 0;
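Two further changes are visible in the trace_kprobe.c hunks above: the perf handlers now look up this_cpu_ptr(call->perf_events) first and return early when no perf consumer is attached, so sizing, buffer reservation and argument copying are skipped entirely, and the new comments record that enable_trace_probe()/disable_trace_probe() rely on event_mutex (or on the single-threaded self-test) for serialization. A trivial userspace sketch of the early-return shape, with stand-in names:

/* Hypothetical sketch: skip all event construction when nobody listens. */
#include <stdio.h>
#include <stddef.h>

struct consumer_list { int nr; };	/* stands in for the per-cpu hlist */

static void perf_func(const struct consumer_list *head)
{
	if (head == NULL || head->nr == 0)
		return;			/* cheap check before sizing and allocating */

	/* ... only now compute sizes, reserve a buffer and submit ... */
	printf("submit event to %d consumer(s)\n", head->nr);
}

int main(void)
{
	struct consumer_list none = { 0 }, some = { 2 };

	perf_func(&none);	/* returns immediately */
	perf_func(&some);	/* does the work */
	return 0;
}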
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 2901e3b88590..a7329b7902f8 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -640,13 +640,20 @@ out:
640 * Enable ftrace, sleep 1/10 second, and then read the trace 640 * Enable ftrace, sleep 1/10 second, and then read the trace
641 * buffer to see if all is in order. 641 * buffer to see if all is in order.
642 */ 642 */
643int 643__init int
644trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) 644trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
645{ 645{
646 int save_ftrace_enabled = ftrace_enabled; 646 int save_ftrace_enabled = ftrace_enabled;
647 unsigned long count; 647 unsigned long count;
648 int ret; 648 int ret;
649 649
650#ifdef CONFIG_DYNAMIC_FTRACE
651 if (ftrace_filter_param) {
652 printk(KERN_CONT " ... kernel command line filter set: force PASS ... ");
653 return 0;
654 }
655#endif
656
650 /* make sure msleep has been recorded */ 657 /* make sure msleep has been recorded */
651 msleep(1); 658 msleep(1);
652 659
@@ -727,13 +734,20 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
727 * Pretty much the same than for the function tracer from which the selftest 734 * Pretty much the same than for the function tracer from which the selftest
728 * has been borrowed. 735 * has been borrowed.
729 */ 736 */
730int 737__init int
731trace_selftest_startup_function_graph(struct tracer *trace, 738trace_selftest_startup_function_graph(struct tracer *trace,
732 struct trace_array *tr) 739 struct trace_array *tr)
733{ 740{
734 int ret; 741 int ret;
735 unsigned long count; 742 unsigned long count;
736 743
744#ifdef CONFIG_DYNAMIC_FTRACE
745 if (ftrace_filter_param) {
746 printk(KERN_CONT " ... kernel command line filter set: force PASS ... ");
747 return 0;
748 }
749#endif
750
737 /* 751 /*
738 * Simulate the init() callback but we attach a watchdog callback 752 * Simulate the init() callback but we attach a watchdog callback
739 * to detect and recover from possible hangs 753 * to detect and recover from possible hangs
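Both startup self-tests above gain an early exit: when a filter was given on the kernel command line (ftrace_filter_param, which only exists under CONFIG_DYNAMIC_FTRACE), the test would only see the filtered subset of functions, so it is skipped and reported as a forced PASS; the functions are also marked __init since they run only at boot. A hypothetical userspace sketch of the guard:

/* Sketch of the force-PASS guard added above; the flag is a stand-in for
 * ftrace_filter_param set from the boot command line. */
#include <stdbool.h>
#include <stdio.h>

static bool ftrace_filter_param;	/* true when ftrace_filter= was given at boot */

static int selftest_startup_function(void)
{
	if (ftrace_filter_param) {
		printf(" ... kernel command line filter set: force PASS ...\n");
		return 0;
	}
	/* ... run the real test: enable tracing, sleep, inspect the buffer ... */
	return 0;
}

int main(void)
{
	ftrace_filter_param = true;
	return selftest_startup_function();
}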
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8f2ac73c7a5f..322e16461072 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -306,6 +306,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
306 struct syscall_metadata *sys_data; 306 struct syscall_metadata *sys_data;
307 struct ring_buffer_event *event; 307 struct ring_buffer_event *event;
308 struct ring_buffer *buffer; 308 struct ring_buffer *buffer;
309 unsigned long irq_flags;
310 int pc;
309 int syscall_nr; 311 int syscall_nr;
310 int size; 312 int size;
311 313
@@ -321,9 +323,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
321 323
322 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 324 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
323 325
326 local_save_flags(irq_flags);
327 pc = preempt_count();
328
324 buffer = tr->trace_buffer.buffer; 329 buffer = tr->trace_buffer.buffer;
325 event = trace_buffer_lock_reserve(buffer, 330 event = trace_buffer_lock_reserve(buffer,
326 sys_data->enter_event->event.type, size, 0, 0); 331 sys_data->enter_event->event.type, size, irq_flags, pc);
327 if (!event) 332 if (!event)
328 return; 333 return;
329 334
@@ -333,7 +338,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
333 338
334 if (!filter_current_check_discard(buffer, sys_data->enter_event, 339 if (!filter_current_check_discard(buffer, sys_data->enter_event,
335 entry, event)) 340 entry, event))
336 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 341 trace_current_buffer_unlock_commit(buffer, event,
342 irq_flags, pc);
337} 343}
338 344
339static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) 345static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -343,6 +349,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
343 struct syscall_metadata *sys_data; 349 struct syscall_metadata *sys_data;
344 struct ring_buffer_event *event; 350 struct ring_buffer_event *event;
345 struct ring_buffer *buffer; 351 struct ring_buffer *buffer;
352 unsigned long irq_flags;
353 int pc;
346 int syscall_nr; 354 int syscall_nr;
347 355
348 syscall_nr = trace_get_syscall_nr(current, regs); 356 syscall_nr = trace_get_syscall_nr(current, regs);
@@ -355,9 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
355 if (!sys_data) 363 if (!sys_data)
356 return; 364 return;
357 365
366 local_save_flags(irq_flags);
367 pc = preempt_count();
368
358 buffer = tr->trace_buffer.buffer; 369 buffer = tr->trace_buffer.buffer;
359 event = trace_buffer_lock_reserve(buffer, 370 event = trace_buffer_lock_reserve(buffer,
360 sys_data->exit_event->event.type, sizeof(*entry), 0, 0); 371 sys_data->exit_event->event.type, sizeof(*entry),
372 irq_flags, pc);
361 if (!event) 373 if (!event)
362 return; 374 return;
363 375
@@ -367,7 +379,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
367 379
368 if (!filter_current_check_discard(buffer, sys_data->exit_event, 380 if (!filter_current_check_discard(buffer, sys_data->exit_event,
369 entry, event)) 381 entry, event))
370 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 382 trace_current_buffer_unlock_commit(buffer, event,
383 irq_flags, pc);
371} 384}
372 385
373static int reg_event_syscall_enter(struct ftrace_event_file *file, 386static int reg_event_syscall_enter(struct ftrace_event_file *file,
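The trace_syscalls.c hunks above sample the interrupt flags and preemption count once per event and pass them to both trace_buffer_lock_reserve() and trace_current_buffer_unlock_commit(), so syscall enter/exit records carry the real irq/preempt state instead of the hard-coded 0, 0. A small userspace analog of the "capture the context once, thread it through reserve and commit" shape, with made-up helper names:

/* Hypothetical analog: sample the context up front, then hand the same
 * values to both the reserve and the commit step. */
#include <stdio.h>

struct ctx { unsigned long flags; int pc; };

static int reserve(struct ctx c)  { printf("reserve flags=%#lx pc=%d\n", c.flags, c.pc); return 0; }
static void commit(struct ctx c)  { printf("commit  flags=%#lx pc=%d\n", c.flags, c.pc); }

int main(void)
{
	struct ctx c = { .flags = 0x200, .pc = 1 };	/* sampled once, up front */

	if (reserve(c) == 0)
		commit(c);	/* the same context reaches the commit */
	return 0;
}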
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 32494fb0ee64..d5d0cd368a56 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -283,8 +283,10 @@ static int create_trace_uprobe(int argc, char **argv)
283 return -EINVAL; 283 return -EINVAL;
284 } 284 }
285 arg = strchr(argv[1], ':'); 285 arg = strchr(argv[1], ':');
286 if (!arg) 286 if (!arg) {
287 ret = -EINVAL;
287 goto fail_address_parse; 288 goto fail_address_parse;
289 }
288 290
289 *arg++ = '\0'; 291 *arg++ = '\0';
290 filename = argv[1]; 292 filename = argv[1];
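The trace_uprobe.c fix above addresses a classic error-path slip: jumping to fail_address_parse with ret still holding 0 would make create_trace_uprobe() report success on a malformed probe spec. A minimal userspace sketch of the bug class and the fix:

/* Sketch of the bug class fixed above: jumping to the error path without
 * setting an error code makes the caller see success (0). */
#include <stdio.h>
#include <string.h>
#include <errno.h>

static int parse(const char *spec)
{
	int ret = 0;
	const char *colon = strchr(spec, ':');

	if (!colon) {
		ret = -EINVAL;		/* the fix: set the code before the goto */
		goto fail;
	}
	return 0;
fail:
	fprintf(stderr, "parse failed: %d\n", ret);
	return ret;
}

int main(void)
{
	return parse("no-colon-here") == -EINVAL ? 0 : 1;
}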
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee8e29a2320c..f02c4a4a0c3c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -272,6 +272,15 @@ static cpumask_var_t *wq_numa_possible_cpumask;
272static bool wq_disable_numa; 272static bool wq_disable_numa;
273module_param_named(disable_numa, wq_disable_numa, bool, 0444); 273module_param_named(disable_numa, wq_disable_numa, bool, 0444);
274 274
275/* see the comment above the definition of WQ_POWER_EFFICIENT */
276#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
277static bool wq_power_efficient = true;
278#else
279static bool wq_power_efficient;
280#endif
281
282module_param_named(power_efficient, wq_power_efficient, bool, 0444);
283
275static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ 284static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
276 285
277/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ 286/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
@@ -305,6 +314,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly;
305EXPORT_SYMBOL_GPL(system_unbound_wq); 314EXPORT_SYMBOL_GPL(system_unbound_wq);
306struct workqueue_struct *system_freezable_wq __read_mostly; 315struct workqueue_struct *system_freezable_wq __read_mostly;
307EXPORT_SYMBOL_GPL(system_freezable_wq); 316EXPORT_SYMBOL_GPL(system_freezable_wq);
317struct workqueue_struct *system_power_efficient_wq __read_mostly;
318EXPORT_SYMBOL_GPL(system_power_efficient_wq);
319struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
320EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
308 321
309static int worker_thread(void *__worker); 322static int worker_thread(void *__worker);
310static void copy_workqueue_attrs(struct workqueue_attrs *to, 323static void copy_workqueue_attrs(struct workqueue_attrs *to,
@@ -4086,6 +4099,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
4086 struct workqueue_struct *wq; 4099 struct workqueue_struct *wq;
4087 struct pool_workqueue *pwq; 4100 struct pool_workqueue *pwq;
4088 4101
4102 /* see the comment above the definition of WQ_POWER_EFFICIENT */
4103 if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
4104 flags |= WQ_UNBOUND;
4105
4089 /* allocate wq and format name */ 4106 /* allocate wq and format name */
4090 if (flags & WQ_UNBOUND) 4107 if (flags & WQ_UNBOUND)
4091 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); 4108 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
@@ -4985,8 +5002,15 @@ static int __init init_workqueues(void)
4985 WQ_UNBOUND_MAX_ACTIVE); 5002 WQ_UNBOUND_MAX_ACTIVE);
4986 system_freezable_wq = alloc_workqueue("events_freezable", 5003 system_freezable_wq = alloc_workqueue("events_freezable",
4987 WQ_FREEZABLE, 0); 5004 WQ_FREEZABLE, 0);
5005 system_power_efficient_wq = alloc_workqueue("events_power_efficient",
5006 WQ_POWER_EFFICIENT, 0);
5007 system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
5008 WQ_FREEZABLE | WQ_POWER_EFFICIENT,
5009 0);
4988 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || 5010 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
4989 !system_unbound_wq || !system_freezable_wq); 5011 !system_unbound_wq || !system_freezable_wq ||
5012 !system_power_efficient_wq ||
5013 !system_freezable_power_efficient_wq);
4990 return 0; 5014 return 0;
4991} 5015}
4992early_initcall(init_workqueues); 5016early_initcall(init_workqueues);
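The workqueue.c changes above add a power_efficient module parameter (defaulting on when CONFIG_WQ_POWER_EFFICIENT_DEFAULT is set), two new system workqueues (system_power_efficient_wq and system_freezable_power_efficient_wq), and a promotion in __alloc_workqueue_key(): when the knob is on, a WQ_POWER_EFFICIENT workqueue becomes WQ_UNBOUND, so its work items are not pinned to, and do not wake, a particular CPU. A hypothetical sketch of just that flag promotion; the bit values here are illustrative, not the kernel's:

/* Sketch of the WQ_POWER_EFFICIENT -> WQ_UNBOUND promotion.  Flag values
 * and the boolean knob are stand-ins for the real definitions. */
#include <stdbool.h>
#include <stdio.h>

#define WQ_UNBOUND		(1u << 1)
#define WQ_FREEZABLE		(1u << 2)
#define WQ_POWER_EFFICIENT	(1u << 7)

static bool wq_power_efficient = true;	/* module parameter stand-in */

static unsigned int effective_flags(unsigned int flags)
{
	if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
		flags |= WQ_UNBOUND;	/* trade per-CPU locality for idle-CPU power */
	return flags;
}

int main(void)
{
	unsigned int f = effective_flags(WQ_FREEZABLE | WQ_POWER_EFFICIENT);

	printf("unbound? %s\n", (f & WQ_UNBOUND) ? "yes" : "no");
	return 0;
}

The behaviour stays opt-in per workqueue and gated by the boot-time knob, since unbinding work trades cache locality for the ability to batch it onto CPUs that are already awake.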