author     Thomas Gleixner <tglx@linutronix.de>   2013-07-12 06:34:42 -0400
committer  Thomas Gleixner <tglx@linutronix.de>   2013-07-12 06:34:42 -0400
commit     f2006e27396f55276f24434f56e208d86e7f9908 (patch)
tree       71896db916d33888b4286f80117d3cac0da40e6d /kernel
parent     e399eb56a6110e13f97e644658648602e2b08de7 (diff)
parent     9903883f1dd6e86f286b7bfa6e4b423f98c1cd9e (diff)
Merge branch 'linus' into timers/urgent
Get upstream changes so we can apply fixes against them
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel')
62 files changed, 3927 insertions, 2553 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 271fd3119af9..470839d1a30e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o cred.o \
+	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o groups.o lglock.o smpboot.o
 
 ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/audit.h b/kernel/audit.h
index 1c95131ef760..123c9b7c3979 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -85,6 +85,7 @@ struct audit_names {
 
 	struct filename *name;
 	int name_len;		/* number of chars to log */
+	bool hidden;		/* don't log this record */
 	bool name_put;		/* call __putname()? */
 
 	unsigned long ino;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 6bd4a90d1991..f7aee8be7fb2 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		f->lsm_rule = NULL;
 
 		/* Support legacy tests for a valid loginuid */
-		if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) {
+		if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) {
 			f->type = AUDIT_LOGINUID_SET;
 			f->val = 0;
 		}
@@ -865,6 +865,12 @@ static inline int audit_add_rule(struct audit_entry *entry)
 		err = audit_add_watch(&entry->rule, &list);
 		if (err) {
 			mutex_unlock(&audit_filter_mutex);
+			/*
+			 * normally audit_add_tree_rule() will free it
+			 * on failure
+			 */
+			if (tree)
+				audit_put_tree(tree);
 			goto error;
 		}
 	}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3c8a601324a2..9845cb32b60a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1399,8 +1399,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	}
 
 	i = 0;
-	list_for_each_entry(n, &context->names_list, list)
+	list_for_each_entry(n, &context->names_list, list) {
+		if (n->hidden)
+			continue;
 		audit_log_name(context, n, NULL, i++, &call_panic);
+	}
 
 	/* Send end of event record to help user space know we are finished */
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1769,14 +1772,15 @@ void audit_putname(struct filename *name)
  * __audit_inode - store the inode and device from a lookup
  * @name: name being audited
  * @dentry: dentry being audited
- * @parent: does this dentry represent the parent?
+ * @flags: attributes for this particular entry
  */
 void __audit_inode(struct filename *name, const struct dentry *dentry,
-		   unsigned int parent)
+		   unsigned int flags)
 {
 	struct audit_context *context = current->audit_context;
 	const struct inode *inode = dentry->d_inode;
 	struct audit_names *n;
+	bool parent = flags & AUDIT_INODE_PARENT;
 
 	if (!context->in_syscall)
 		return;
@@ -1831,6 +1835,8 @@ out:
 	if (parent) {
 		n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
 		n->type = AUDIT_TYPE_PARENT;
+		if (flags & AUDIT_INODE_HIDDEN)
+			n->hidden = true;
 	} else {
 		n->name_len = AUDIT_NAME_FULL;
 		n->type = AUDIT_TYPE_NORMAL;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7c9e6ddb979..e5583d10a325 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,9 +63,6 @@
 
 #include <linux/atomic.h>
 
-/* css deactivation bias, makes css->refcnt negative to deny new trygets */
-#define CSS_DEACT_BIAS INT_MIN
-
 /*
  * cgroup_mutex is the master lock. Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -99,16 +96,19 @@ static DEFINE_MUTEX(cgroup_root_mutex);
  */
 #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
 #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
-static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
+static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
 #include <linux/cgroup_subsys.h>
 };
 
 /*
- * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
- * subsystems that are otherwise unattached - it never has more than a
- * single cgroup, and all tasks are part of that cgroup.
+ * The dummy hierarchy, reserved for the subsystems that are otherwise
+ * unattached - it never has more than a single cgroup, and all tasks are
+ * part of that cgroup.
  */
-static struct cgroupfs_root rootnode;
+static struct cgroupfs_root cgroup_dummy_root;
+
+/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
+static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
 
 /*
  * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
@@ -186,18 +186,28 @@ struct cgroup_event {
 
 /* The list of hierarchy roots */
 
-static LIST_HEAD(roots);
-static int root_count;
+static LIST_HEAD(cgroup_roots);
+static int cgroup_root_count;
 
-static DEFINE_IDA(hierarchy_ida);
-static int next_hierarchy_id;
-static DEFINE_SPINLOCK(hierarchy_id_lock);
-
-/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
-#define dummytop (&rootnode.top_cgroup)
+/*
+ * Hierarchy ID allocation and mapping. It follows the same exclusion
+ * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
+ * writes, either for reads.
+ */
+static DEFINE_IDR(cgroup_hierarchy_idr);
 
 static struct cgroup_name root_cgroup_name = { .name = "/" };
 
+/*
+ * Assign a monotonically increasing serial number to cgroups. It
+ * guarantees cgroups with bigger numbers are newer than those with smaller
+ * numbers. Also, as cgroups are always appended to the parent's
+ * ->children list, it guarantees that sibling cgroups are always sorted in
+ * the ascending serial number order on the list. Protected by
+ * cgroup_mutex.
+ */
+static u64 cgroup_serial_nr_next = 1;
+
 /* This flag indicates whether tasks in the fork and exit paths should
  * check for fork/exit handlers to call. This avoids us having to do
  * extra work in the fork/exit path if none of the subsystems need to
@@ -205,27 +215,15 @@ static struct cgroup_name root_cgroup_name = { .name = "/" };
  */
 static int need_forkexit_callback __read_mostly;
 
+static void cgroup_offline_fn(struct work_struct *work);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			      struct cftype cfts[], bool is_add);
 
-static int css_unbias_refcnt(int refcnt)
-{
-	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
-}
-
-/* the current nr of refs, always >= 0 whether @css is deactivated or not */
-static int css_refcnt(struct cgroup_subsys_state *css)
-{
-	int v = atomic_read(&css->refcnt);
-
-	return css_unbias_refcnt(v);
-}
-
 /* convenient tests for these bits */
-inline int cgroup_is_removed(const struct cgroup *cgrp)
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
-	return test_bit(CGRP_REMOVED, &cgrp->flags);
+	return test_bit(CGRP_DEAD, &cgrp->flags);
 }
 
 /**
@@ -261,16 +259,38 @@ static int notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
-/*
- * for_each_subsys() allows you to iterate on each subsystem attached to
- * an active hierarchy
+/**
+ * for_each_subsys - iterate all loaded cgroup subsystems
+ * @ss: the iteration cursor
+ * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ *
+ * Should be called under cgroup_mutex.
  */
-#define for_each_subsys(_root, _ss) \
-	list_for_each_entry(_ss, &_root->subsys_list, sibling)
+#define for_each_subsys(ss, i) \
+	for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \
+		if (({ lockdep_assert_held(&cgroup_mutex); \
+		       !((ss) = cgroup_subsys[i]); })) { } \
+		else
+
+/**
+ * for_each_builtin_subsys - iterate all built-in cgroup subsystems
+ * @ss: the iteration cursor
+ * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
+ *
+ * Bulit-in subsystems are always present and iteration itself doesn't
+ * require any synchronization.
+ */
+#define for_each_builtin_subsys(ss, i) \
+	for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
+	     (((ss) = cgroup_subsys[i]) || true); (i)++)
+
+/* iterate each subsystem attached to a hierarchy */
+#define for_each_root_subsys(root, ss) \
+	list_for_each_entry((ss), &(root)->subsys_list, sibling)
 
-/* for_each_active_root() allows you to iterate across the active hierarchies */
-#define for_each_active_root(_root) \
-	list_for_each_entry(_root, &roots, root_list)
+/* iterate across the active hierarchies */
+#define for_each_active_root(root) \
+	list_for_each_entry((root), &cgroup_roots, root_list)
 
 static inline struct cgroup *__d_cgrp(struct dentry *dentry)
 {
@@ -297,7 +317,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 static bool cgroup_lock_live_group(struct cgroup *cgrp)
 {
 	mutex_lock(&cgroup_mutex);
-	if (cgroup_is_removed(cgrp)) {
+	if (cgroup_is_dead(cgrp)) {
 		mutex_unlock(&cgroup_mutex);
 		return false;
 	}
@@ -312,20 +332,24 @@ static void cgroup_release_agent(struct work_struct *work);
 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
 
-/* Link structure for associating css_set objects with cgroups */
-struct cg_cgroup_link {
-	/*
-	 * List running through cg_cgroup_links associated with a
-	 * cgroup, anchored on cgroup->css_sets
-	 */
-	struct list_head cgrp_link_list;
-	struct cgroup *cgrp;
-	/*
-	 * List running through cg_cgroup_links pointing at a
-	 * single css_set object, anchored on css_set->cg_links
-	 */
-	struct list_head cg_link_list;
-	struct css_set *cg;
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies. In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+	/* the cgroup and css_set this link associates */
+	struct cgroup *cgrp;
+	struct css_set *cset;
+
+	/* list of cgrp_cset_links anchored at cgrp->cset_links */
+	struct list_head cset_link;
+
+	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
+	struct list_head cgrp_link;
 };
 
 /* The default css_set - used by init and its children prior to any
@@ -336,7 +360,7 @@ struct cg_cgroup_link {
  */
 
 static struct css_set init_css_set;
-static struct cg_cgroup_link init_css_set_link;
+static struct cgrp_cset_link init_cgrp_cset_link;
 
 static int cgroup_init_idr(struct cgroup_subsys *ss,
 			   struct cgroup_subsys_state *css);
@@ -357,10 +381,11 @@ static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 
 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 {
-	int i;
 	unsigned long key = 0UL;
+	struct cgroup_subsys *ss;
+	int i;
 
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+	for_each_subsys(ss, i)
 		key += (unsigned long)css[i];
 	key = (key >> 16) ^ key;
 
@@ -373,90 +398,83 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
  * compiled into their kernel but not actually in use */
 static int use_task_css_set_links __read_mostly;
 
-static void __put_css_set(struct css_set *cg, int taskexit)
+static void __put_css_set(struct css_set *cset, int taskexit)
 {
-	struct cg_cgroup_link *link;
-	struct cg_cgroup_link *saved_link;
+	struct cgrp_cset_link *link, *tmp_link;
+
 	/*
 	 * Ensure that the refcount doesn't hit zero while any readers
 	 * can see it. Similar to atomic_dec_and_lock(), but for an
 	 * rwlock
 	 */
-	if (atomic_add_unless(&cg->refcount, -1, 1))
+	if (atomic_add_unless(&cset->refcount, -1, 1))
 		return;
 	write_lock(&css_set_lock);
-	if (!atomic_dec_and_test(&cg->refcount)) {
+	if (!atomic_dec_and_test(&cset->refcount)) {
 		write_unlock(&css_set_lock);
 		return;
 	}
 
 	/* This css_set is dead. unlink it and release cgroup refcounts */
-	hash_del(&cg->hlist);
+	hash_del(&cset->hlist);
 	css_set_count--;
 
-	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
-				 cg_link_list) {
+	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 		struct cgroup *cgrp = link->cgrp;
-		list_del(&link->cg_link_list);
-		list_del(&link->cgrp_link_list);
 
-		/*
-		 * We may not be holding cgroup_mutex, and if cgrp->count is
-		 * dropped to 0 the cgroup can be destroyed at any time, hence
-		 * rcu_read_lock is used to keep it alive.
-		 */
-		rcu_read_lock();
-		if (atomic_dec_and_test(&cgrp->count) &&
-		    notify_on_release(cgrp)) {
+		list_del(&link->cset_link);
+		list_del(&link->cgrp_link);
+
+		/* @cgrp can't go away while we're holding css_set_lock */
+		if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
 			if (taskexit)
 				set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
-		rcu_read_unlock();
 
 		kfree(link);
 	}
 
 	write_unlock(&css_set_lock);
-	kfree_rcu(cg, rcu_head);
+	kfree_rcu(cset, rcu_head);
 }
 
 /*
  * refcounted get/put for css_set objects
  */
-static inline void get_css_set(struct css_set *cg)
+static inline void get_css_set(struct css_set *cset)
 {
-	atomic_inc(&cg->refcount);
+	atomic_inc(&cset->refcount);
 }
 
-static inline void put_css_set(struct css_set *cg)
+static inline void put_css_set(struct css_set *cset)
 {
-	__put_css_set(cg, 0);
+	__put_css_set(cset, 0);
 }
 
-static inline void put_css_set_taskexit(struct css_set *cg)
+static inline void put_css_set_taskexit(struct css_set *cset)
 {
-	__put_css_set(cg, 1);
+	__put_css_set(cset, 1);
 }
 
-/*
+/**
  * compare_css_sets - helper function for find_existing_css_set().
- * @cg: candidate css_set being tested
- * @old_cg: existing css_set for a task
+ * @cset: candidate css_set being tested
+ * @old_cset: existing css_set for a task
  * @new_cgrp: cgroup that's being entered by the task
  * @template: desired set of css pointers in css_set (pre-calculated)
  *
  * Returns true if "cg" matches "old_cg" except for the hierarchy
  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
  */
-static bool compare_css_sets(struct css_set *cg,
-			     struct css_set *old_cg,
+static bool compare_css_sets(struct css_set *cset,
+			     struct css_set *old_cset,
 			     struct cgroup *new_cgrp,
 			     struct cgroup_subsys_state *template[])
 {
 	struct list_head *l1, *l2;
 
-	if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
+	if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
 		/* Not all subsystems matched */
 		return false;
 	}
@@ -470,28 +488,28 @@ static bool compare_css_sets(struct css_set *cg,
 	 * candidates.
 	 */
 
-	l1 = &cg->cg_links;
-	l2 = &old_cg->cg_links;
+	l1 = &cset->cgrp_links;
+	l2 = &old_cset->cgrp_links;
 	while (1) {
-		struct cg_cgroup_link *cgl1, *cgl2;
-		struct cgroup *cg1, *cg2;
+		struct cgrp_cset_link *link1, *link2;
+		struct cgroup *cgrp1, *cgrp2;
 
 		l1 = l1->next;
 		l2 = l2->next;
 		/* See if we reached the end - both lists are equal length. */
-		if (l1 == &cg->cg_links) {
-			BUG_ON(l2 != &old_cg->cg_links);
+		if (l1 == &cset->cgrp_links) {
+			BUG_ON(l2 != &old_cset->cgrp_links);
 			break;
 		} else {
-			BUG_ON(l2 == &old_cg->cg_links);
+			BUG_ON(l2 == &old_cset->cgrp_links);
 		}
 		/* Locate the cgroups associated with these links. */
-		cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
-		cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
-		cg1 = cgl1->cgrp;
-		cg2 = cgl2->cgrp;
+		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
+		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
+		cgrp1 = link1->cgrp;
+		cgrp2 = link2->cgrp;
 		/* Hierarchies should be linked in the same order. */
-		BUG_ON(cg1->root != cg2->root);
+		BUG_ON(cgrp1->root != cgrp2->root);
 
 		/*
 		 * If this hierarchy is the hierarchy of the cgroup
@@ -500,46 +518,39 @@ static bool compare_css_sets(struct css_set *cg,
 		 * hierarchy, then this css_set should point to the
 		 * same cgroup as the old css_set.
 		 */
-		if (cg1->root == new_cgrp->root) {
-			if (cg1 != new_cgrp)
+		if (cgrp1->root == new_cgrp->root) {
+			if (cgrp1 != new_cgrp)
 				return false;
 		} else {
-			if (cg1 != cg2)
+			if (cgrp1 != cgrp2)
 				return false;
 		}
 	}
 	return true;
 }
 
-/*
- * find_existing_css_set() is a helper for
- * find_css_set(), and checks to see whether an existing
- * css_set is suitable.
- *
- * oldcg: the cgroup group that we're using before the cgroup
- * transition
- *
- * cgrp: the cgroup that we're moving into
- *
- * template: location in which to build the desired set of subsystem
- * state objects for the new cgroup group
+/**
+ * find_existing_css_set - init css array and find the matching css_set
+ * @old_cset: the css_set that we're using before the cgroup transition
+ * @cgrp: the cgroup that we're moving into
+ * @template: out param for the new set of csses, should be clear on entry
  */
-static struct css_set *find_existing_css_set(
-	struct css_set *oldcg,
-	struct cgroup *cgrp,
-	struct cgroup_subsys_state *template[])
+static struct css_set *find_existing_css_set(struct css_set *old_cset,
+					struct cgroup *cgrp,
+					struct cgroup_subsys_state *template[])
 {
-	int i;
 	struct cgroupfs_root *root = cgrp->root;
-	struct css_set *cg;
+	struct cgroup_subsys *ss;
+	struct css_set *cset;
 	unsigned long key;
+	int i;
 
 	/*
 	 * Build the set of subsystem state objects that we want to see in the
 	 * new css_set. while subsystems can change globally, the entries here
 	 * won't change, so no need for locking.
 	 */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+	for_each_subsys(ss, i) {
 		if (root->subsys_mask & (1UL << i)) {
 			/* Subsystem is in this hierarchy. So we want
 			 * the subsystem state from the new
@@ -548,148 +559,152 @@ static struct css_set *find_existing_css_set(
 		} else {
 			/* Subsystem is not in this hierarchy, so we
 			 * don't want to change the subsystem state */
-			template[i] = oldcg->subsys[i];
+			template[i] = old_cset->subsys[i];
 		}
 	}
 
 	key = css_set_hash(template);
-	hash_for_each_possible(css_set_table, cg, hlist, key) {
-		if (!compare_css_sets(cg, oldcg, cgrp, template))
+	hash_for_each_possible(css_set_table, cset, hlist, key) {
+		if (!compare_css_sets(cset, old_cset, cgrp, template))
 			continue;
 
 		/* This css_set matches what we need */
-		return cg;
+		return cset;
 	}
 
 	/* No existing cgroup group matched */
 	return NULL;
 }
 
-static void free_cg_links(struct list_head *tmp)
+static void free_cgrp_cset_links(struct list_head *links_to_free)
 {
-	struct cg_cgroup_link *link;
-	struct cg_cgroup_link *saved_link;
+	struct cgrp_cset_link *link, *tmp_link;
 
-	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
-		list_del(&link->cgrp_link_list);
+	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
+		list_del(&link->cset_link);
 		kfree(link);
 	}
 }
 
-/*
- * allocate_cg_links() allocates "count" cg_cgroup_link structures
- * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
- * success or a negative error
+/**
+ * allocate_cgrp_cset_links - allocate cgrp_cset_links
+ * @count: the number of links to allocate
+ * @tmp_links: list_head the allocated links are put on
+ *
+ * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
+ * through ->cset_link. Returns 0 on success or -errno.
  */
-static int allocate_cg_links(int count, struct list_head *tmp)
+static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
 {
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 	int i;
-	INIT_LIST_HEAD(tmp);
+
+	INIT_LIST_HEAD(tmp_links);
+
 	for (i = 0; i < count; i++) {
-		link = kmalloc(sizeof(*link), GFP_KERNEL);
+		link = kzalloc(sizeof(*link), GFP_KERNEL);
 		if (!link) {
-			free_cg_links(tmp);
+			free_cgrp_cset_links(tmp_links);
 			return -ENOMEM;
 		}
-		list_add(&link->cgrp_link_list, tmp);
+		list_add(&link->cset_link, tmp_links);
 	}
 	return 0;
 }
 
 /**
  * link_css_set - a helper function to link a css_set to a cgroup
- * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
- * @cg: the css_set to be linked
+ * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
+ * @cset: the css_set to be linked
  * @cgrp: the destination cgroup
  */
-static void link_css_set(struct list_head *tmp_cg_links,
-			 struct css_set *cg, struct cgroup *cgrp)
+static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
+			 struct cgroup *cgrp)
 {
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 
-	BUG_ON(list_empty(tmp_cg_links));
-	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
-				cgrp_link_list);
-	link->cg = cg;
+	BUG_ON(list_empty(tmp_links));
+	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
+	link->cset = cset;
 	link->cgrp = cgrp;
-	atomic_inc(&cgrp->count);
-	list_move(&link->cgrp_link_list, &cgrp->css_sets);
+	list_move(&link->cset_link, &cgrp->cset_links);
 	/*
 	 * Always add links to the tail of the list so that the list
 	 * is sorted by order of hierarchy creation
 	 */
-	list_add_tail(&link->cg_link_list, &cg->cg_links);
+	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 }
 
-/*
- * find_css_set() takes an existing cgroup group and a
- * cgroup object, and returns a css_set object that's
- * equivalent to the old group, but with the given cgroup
- * substituted into the appropriate hierarchy. Must be called with
- * cgroup_mutex held
+/**
+ * find_css_set - return a new css_set with one cgroup updated
+ * @old_cset: the baseline css_set
+ * @cgrp: the cgroup to be updated
+ *
+ * Return a new css_set that's equivalent to @old_cset, but with @cgrp
+ * substituted into the appropriate hierarchy.
  */
-static struct css_set *find_css_set(
-	struct css_set *oldcg, struct cgroup *cgrp)
+static struct css_set *find_css_set(struct css_set *old_cset,
+				    struct cgroup *cgrp)
 {
-	struct css_set *res;
-	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-
-	struct list_head tmp_cg_links;
-
-	struct cg_cgroup_link *link;
+	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
+	struct css_set *cset;
+	struct list_head tmp_links;
+	struct cgrp_cset_link *link;
 	unsigned long key;
 
+	lockdep_assert_held(&cgroup_mutex);
+
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
 	read_lock(&css_set_lock);
-	res = find_existing_css_set(oldcg, cgrp, template);
-	if (res)
-		get_css_set(res);
+	cset = find_existing_css_set(old_cset, cgrp, template);
+	if (cset)
+		get_css_set(cset);
 	read_unlock(&css_set_lock);
 
-	if (res)
-		return res;
+	if (cset)
+		return cset;
 
-	res = kmalloc(sizeof(*res), GFP_KERNEL);
-	if (!res)
+	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
+	if (!cset)
 		return NULL;
 
-	/* Allocate all the cg_cgroup_link objects that we'll need */
-	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
-		kfree(res);
+	/* Allocate all the cgrp_cset_link objects that we'll need */
+	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
+		kfree(cset);
 		return NULL;
 	}
 
-	atomic_set(&res->refcount, 1);
-	INIT_LIST_HEAD(&res->cg_links);
-	INIT_LIST_HEAD(&res->tasks);
-	INIT_HLIST_NODE(&res->hlist);
+	atomic_set(&cset->refcount, 1);
+	INIT_LIST_HEAD(&cset->cgrp_links);
+	INIT_LIST_HEAD(&cset->tasks);
+	INIT_HLIST_NODE(&cset->hlist);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
-	memcpy(res->subsys, template, sizeof(res->subsys));
+	memcpy(cset->subsys, template, sizeof(cset->subsys));
 
 	write_lock(&css_set_lock);
 	/* Add reference counts and links from the new css_set. */
-	list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
+	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
+
 		if (c->root == cgrp->root)
 			c = cgrp;
-		link_css_set(&tmp_cg_links, res, c);
+		link_css_set(&tmp_links, cset, c);
 	}
 
-	BUG_ON(!list_empty(&tmp_cg_links));
+	BUG_ON(!list_empty(&tmp_links));
 
 	css_set_count++;
 
 	/* Add this cgroup group to the hash table */
-	key = css_set_hash(res->subsys);
-	hash_add(css_set_table, &res->hlist, key);
+	key = css_set_hash(cset->subsys);
+	hash_add(css_set_table, &cset->hlist, key);
 
 	write_unlock(&css_set_lock);
 
-	return res;
+	return cset;
 }
 
 /*
@@ -699,7 +714,7 @@ static struct css_set *find_css_set(
 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 					    struct cgroupfs_root *root)
 {
-	struct css_set *css;
+	struct css_set *cset;
 	struct cgroup *res = NULL;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
@@ -709,13 +724,15 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 	 * task can't change groups, so the only thing that can happen
 	 * is that it exits and its css is set back to init_css_set.
 	 */
-	css = task->cgroups;
-	if (css == &init_css_set) {
+	cset = task_css_set(task);
+	if (cset == &init_css_set) {
 		res = &root->top_cgroup;
 	} else {
-		struct cg_cgroup_link *link;
-		list_for_each_entry(link, &css->cg_links, cg_link_list) {
+		struct cgrp_cset_link *link;
+
+		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 			struct cgroup *c = link->cgrp;
+
 			if (c->root == root) {
 				res = c;
 				break;
@@ -828,14 +845,14 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 
 static void cgroup_free_fn(struct work_struct *work)
 {
-	struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_mutex);
 	/*
 	 * Release the subsystem state objects.
 	 */
-	for_each_subsys(cgrp->root, ss)
+	for_each_root_subsys(cgrp->root, ss)
 		ss->css_free(cgrp);
 
 	cgrp->root->number_of_cgroups--;
@@ -873,7 +890,8 @@ static void cgroup_free_rcu(struct rcu_head *head)
 {
 	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
-	schedule_work(&cgrp->free_work);
+	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
+	schedule_work(&cgrp->destroy_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -882,7 +900,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	if (S_ISDIR(inode->i_mode)) {
 		struct cgroup *cgrp = dentry->d_fsdata;
 
-		BUG_ON(!(cgroup_is_removed(cgrp)));
+		BUG_ON(!(cgroup_is_dead(cgrp)));
 		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
@@ -950,7 +968,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
 	struct cgroup *cgrp = __d_cgrp(dir);
 	struct cgroup_subsys *ss;
 
-	for_each_subsys(cgrp->root, ss) {
+	for_each_root_subsys(cgrp->root, ss) {
 		struct cftype_set *set;
 		if (!test_bit(ss->subsys_id, &subsys_mask))
 			continue;
@@ -988,30 +1006,23 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
  * returns an error, no reference counts are touched.
  */
 static int rebind_subsystems(struct cgroupfs_root *root,
-			     unsigned long final_subsys_mask)
+			     unsigned long added_mask, unsigned removed_mask)
 {
-	unsigned long added_mask, removed_mask;
 	struct cgroup *cgrp = &root->top_cgroup;
+	struct cgroup_subsys *ss;
 	int i;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
 
-	removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
-	added_mask = final_subsys_mask & ~root->actual_subsys_mask;
 	/* Check that any added subsystems are currently free */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+	for_each_subsys(ss, i) {
 		unsigned long bit = 1UL << i;
-		struct cgroup_subsys *ss = subsys[i];
+
 		if (!(bit & added_mask))
 			continue;
-		/*
-		 * Nobody should tell us to do a subsys that doesn't exist:
-		 * parse_cgroupfs_options should catch that case and refcounts
-		 * ensure that subsystems won't disappear once selected.
-		 */
-		BUG_ON(ss == NULL);
-		if (ss->root != &rootnode) {
+
+		if (ss->root != &cgroup_dummy_root) {
 			/* Subsystem isn't free */
 			return -EBUSY;
 		}
@@ -1025,38 +1036,41 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			return -EBUSY;
 
 	/* Process each subsystem */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup_subsys *ss = subsys[i];
+	for_each_subsys(ss, i) {
 		unsigned long bit = 1UL << i;
+
 		if (bit & added_mask) {
 			/* We're binding this subsystem to this hierarchy */
-			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i]);
-			BUG_ON(!dummytop->subsys[i]);
-			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
-			cgrp->subsys[i] = dummytop->subsys[i];
+			BUG_ON(!cgroup_dummy_top->subsys[i]);
+			BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);
+
+			cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
 			cgrp->subsys[i]->cgroup = cgrp;
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
 				ss->bind(cgrp);
+
 			/* refcount was already taken, and we're keeping it */
+			root->subsys_mask |= bit;
 		} else if (bit & removed_mask) {
 			/* We're removing this subsystem */
-			BUG_ON(ss == NULL);
-			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
+			BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+
 			if (ss->bind)
-				ss->bind(dummytop);
-			dummytop->subsys[i]->cgroup = dummytop;
+				ss->bind(cgroup_dummy_top);
+			cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
 			cgrp->subsys[i] = NULL;
-			subsys[i]->root = &rootnode;
-			list_move(&ss->sibling, &rootnode.subsys_list);
+			cgroup_subsys[i]->root = &cgroup_dummy_root;
+			list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
+
 			/* subsystem is now free - drop reference on module */
 			module_put(ss->module);
-		} else if (bit & final_subsys_mask) {
+			root->subsys_mask &= ~bit;
+		} else if (bit & root->subsys_mask) {
 			/* Subsystem state should already exist */
-			BUG_ON(ss == NULL);
 			BUG_ON(!cgrp->subsys[i]);
 			/*
 			 * a refcount was taken, but we already had one, so
@@ -1071,7 +1085,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			BUG_ON(cgrp->subsys[i]);
 		}
 	}
-	root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
+
+	/*
+	 * Mark @root has finished binding subsystems. @root->subsys_mask
+	 * now matches the bound subsystems.
+	 */
+	root->flags |= CGRP_ROOT_SUBSYS_BOUND;
 
 	return 0;
 }
@@ -1082,7 +1101,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_root_mutex);
-	for_each_subsys(root, ss)
+	for_each_root_subsys(root, ss)
 		seq_printf(seq, ",%s", ss->name);
 	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
 		seq_puts(seq, ",sane_behavior");
@@ -1114,18 +1133,19 @@ struct cgroup_sb_opts {
 };
 
 /*
- * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
- * with cgroup_mutex held to protect the subsys[] array. This function takes
- * refcounts on subsystems to be used, unless it returns error, in which case
- * no refcounts are taken.
+ * Convert a hierarchy specifier into a bitmask of subsystems and
+ * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
+ * array. This function takes refcounts on subsystems to be used, unless it
+ * returns error, in which case no refcounts are taken.
  */
 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
 	char *token, *o = data;
 	bool all_ss = false, one_ss = false;
 	unsigned long mask = (unsigned long)-1;
-	int i;
 	bool module_pin_failed = false;
+	struct cgroup_subsys *ss;
+	int i;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 
@@ -1202,10 +1222,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			continue;
 		}
 
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-			if (ss == NULL)
-				continue;
+		for_each_subsys(ss, i) {
 			if (strcmp(token, ss->name))
 				continue;
 			if (ss->disabled)
@@ -1228,16 +1245,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	 * otherwise if 'none', 'name=' and a subsystem name options
 	 * were not specified, let's default to 'all'
 	 */
-	if (all_ss || (!one_ss && !opts->none && !opts->name)) {
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-			struct cgroup_subsys *ss = subsys[i];
-			if (ss == NULL)
-				continue;
-			if (ss->disabled)
-				continue;
-			set_bit(i, &opts->subsys_mask);
-		}
-	}
+	if (all_ss || (!one_ss && !opts->none && !opts->name))
+		for_each_subsys(ss, i)
+			if (!ss->disabled)
+				set_bit(i, &opts->subsys_mask);
 
 	/* Consistency checks */
 
@@ -1281,12 +1292,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	 * take duplicate reference counts on a subsystem that's already used,
 	 * but rebind_subsystems handles this case.
 	 */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		unsigned long bit = 1UL << i;
-
-		if (!(bit & opts->subsys_mask))
+	for_each_subsys(ss, i) {
+		if (!(opts->subsys_mask & (1UL << i)))
 			continue;
-		if (!try_module_get(subsys[i]->module)) {
+		if (!try_module_get(cgroup_subsys[i]->module)) {
 			module_pin_failed = true;
 			break;
 		}
@@ -1303,7 +1312,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
 		if (!(bit & opts->subsys_mask))
 			continue;
-		module_put(subsys[i]->module);
+		module_put(cgroup_subsys[i]->module);
 	}
 	return -ENOENT;
 }
@@ -1313,14 +1322,14 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
 static void drop_parsed_module_refcounts(unsigned long subsys_mask)
 {
+	struct cgroup_subsys *ss;
 	int i;
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		unsigned long bit = 1UL << i;
 
-		if (!(bit & subsys_mask))
-			continue;
-		module_put(subsys[i]->module);
-	}
+	mutex_lock(&cgroup_mutex);
+	for_each_subsys(ss, i)
+		if (subsys_mask & (1UL << i))
+			module_put(cgroup_subsys[i]->module);
+	mutex_unlock(&cgroup_mutex);
 }
 
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
@@ -1345,7 +1354,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
-	if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
+	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
 		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
 			   task_tgid_nr(current), current->comm);
 
@@ -1353,10 +1362,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	removed_mask = root->subsys_mask & ~opts.subsys_mask;
 
 	/* Don't allow flags or name to change at remount */
-	if (opts.flags != root->flags ||
+	if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
 	    (opts.name && strcmp(opts.name, root->name))) {
+		pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
+		       opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
+		       root->flags & CGRP_ROOT_OPTION_MASK, root->name);
 		ret = -EINVAL;
-		drop_parsed_module_refcounts(opts.subsys_mask);
 		goto out_unlock;
 	}
 
@@ -1367,11 +1378,10 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	 */
 	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
 
-	ret = rebind_subsystems(root, opts.subsys_mask);
+	ret = rebind_subsystems(root, added_mask, removed_mask);
 	if (ret) {
 		/* rebind_subsystems failed, re-populate the removed files */
 		cgroup_populate_dir(cgrp, false, removed_mask);
-		drop_parsed_module_refcounts(opts.subsys_mask);
 		goto out_unlock;
 	}
 
@@ -1386,6 +1396,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+	if (ret)
+		drop_parsed_module_refcounts(opts.subsys_mask);
 	return ret;
 }
 
@@ -1401,11 +1413,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1401 | INIT_LIST_HEAD(&cgrp->sibling); | 1413 | INIT_LIST_HEAD(&cgrp->sibling); |
1402 | INIT_LIST_HEAD(&cgrp->children); | 1414 | INIT_LIST_HEAD(&cgrp->children); |
1403 | INIT_LIST_HEAD(&cgrp->files); | 1415 | INIT_LIST_HEAD(&cgrp->files); |
1404 | INIT_LIST_HEAD(&cgrp->css_sets); | 1416 | INIT_LIST_HEAD(&cgrp->cset_links); |
1405 | INIT_LIST_HEAD(&cgrp->allcg_node); | ||
1406 | INIT_LIST_HEAD(&cgrp->release_list); | 1417 | INIT_LIST_HEAD(&cgrp->release_list); |
1407 | INIT_LIST_HEAD(&cgrp->pidlists); | 1418 | INIT_LIST_HEAD(&cgrp->pidlists); |
1408 | INIT_WORK(&cgrp->free_work, cgroup_free_fn); | ||
1409 | mutex_init(&cgrp->pidlist_mutex); | 1419 | mutex_init(&cgrp->pidlist_mutex); |
1410 | INIT_LIST_HEAD(&cgrp->event_list); | 1420 | INIT_LIST_HEAD(&cgrp->event_list); |
1411 | spin_lock_init(&cgrp->event_list_lock); | 1421 | spin_lock_init(&cgrp->event_list_lock); |
@@ -1418,37 +1428,37 @@ static void init_cgroup_root(struct cgroupfs_root *root) | |||
1418 | 1428 | ||
1419 | INIT_LIST_HEAD(&root->subsys_list); | 1429 | INIT_LIST_HEAD(&root->subsys_list); |
1420 | INIT_LIST_HEAD(&root->root_list); | 1430 | INIT_LIST_HEAD(&root->root_list); |
1421 | INIT_LIST_HEAD(&root->allcg_list); | ||
1422 | root->number_of_cgroups = 1; | 1431 | root->number_of_cgroups = 1; |
1423 | cgrp->root = root; | 1432 | cgrp->root = root; |
1424 | cgrp->name = &root_cgroup_name; | 1433 | RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); |
1425 | init_cgroup_housekeeping(cgrp); | 1434 | init_cgroup_housekeeping(cgrp); |
1426 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
1427 | } | 1435 | } |
1428 | 1436 | ||
1429 | static bool init_root_id(struct cgroupfs_root *root) | 1437 | static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) |
1430 | { | 1438 | { |
1431 | int ret = 0; | 1439 | int id; |
1432 | 1440 | ||
1433 | do { | 1441 | lockdep_assert_held(&cgroup_mutex); |
1434 | if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) | 1442 | lockdep_assert_held(&cgroup_root_mutex); |
1435 | return false; | 1443 | |
1436 | spin_lock(&hierarchy_id_lock); | 1444 | id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, |
1437 | /* Try to allocate the next unused ID */ | 1445 | GFP_KERNEL); |
1438 | ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, | 1446 | if (id < 0) |
1439 | &root->hierarchy_id); | 1447 | return id; |
1440 | if (ret == -ENOSPC) | 1448 | |
1441 | /* Try again starting from 0 */ | 1449 | root->hierarchy_id = id; |
1442 | ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); | 1450 | return 0; |
1443 | if (!ret) { | 1451 | } |
1444 | next_hierarchy_id = root->hierarchy_id + 1; | 1452 | |
1445 | } else if (ret != -EAGAIN) { | 1453 | static void cgroup_exit_root_id(struct cgroupfs_root *root) |
1446 | /* Can only get here if the 31-bit IDR is full ... */ | 1454 | { |
1447 | BUG_ON(ret); | 1455 | lockdep_assert_held(&cgroup_mutex); |
1448 | } | 1456 | lockdep_assert_held(&cgroup_root_mutex); |
1449 | spin_unlock(&hierarchy_id_lock); | 1457 | |
1450 | } while (ret); | 1458 | if (root->hierarchy_id) { |
1451 | return true; | 1459 | idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); |
1460 | root->hierarchy_id = 0; | ||
1461 | } | ||
1452 | } | 1462 | } |
1453 | 1463 | ||
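idr_alloc_cyclic() replaces the whole ida retry loop above: it returns the lowest free ID at or after the point where the previous allocation left off, wrapping back to the start of the range when it runs off the end, which is what the old next_hierarchy_id bookkeeping emulated by hand. A self-contained sketch of that allocation policy over a tiny table (plain C, not the kernel idr API; the start value 2 mirrors the later cgroup_init_root_id(root, 2, 0) call that keeps IDs 0 and 1 reserved):

#include <stdbool.h>
#include <stdio.h>

#define NR_IDS 8

static bool used[NR_IDS];
static int next_hint;               /* where the next search starts */

/* Allocate the first free ID at or after the hint, wrapping at most once. */
static int alloc_cyclic(int start)
{
    for (int n = 0; n < NR_IDS; n++) {
        int id = (next_hint + n) % NR_IDS;

        if (id < start || used[id])
            continue;
        used[id] = true;
        next_hint = id + 1;
        return id;
    }
    return -1;                      /* table full */
}

static void release_id(int id)
{
    used[id] = false;
}

int main(void)
{
    for (int i = 0; i < 3; i++)
        printf("got %d\n", alloc_cyclic(2));    /* 2, 3, 4 */
    release_id(3);
    printf("got %d\n", alloc_cyclic(2));        /* 5, not 3: cyclic, not lowest-first */
    return 0;
}
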
1454 | static int cgroup_test_super(struct super_block *sb, void *data) | 1464 | static int cgroup_test_super(struct super_block *sb, void *data) |
@@ -1482,12 +1492,16 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
1482 | if (!root) | 1492 | if (!root) |
1483 | return ERR_PTR(-ENOMEM); | 1493 | return ERR_PTR(-ENOMEM); |
1484 | 1494 | ||
1485 | if (!init_root_id(root)) { | ||
1486 | kfree(root); | ||
1487 | return ERR_PTR(-ENOMEM); | ||
1488 | } | ||
1489 | init_cgroup_root(root); | 1495 | init_cgroup_root(root); |
1490 | 1496 | ||
1497 | /* | ||
1498 | * We need to set @root->subsys_mask now so that @root can be | ||
1499 | * matched by cgroup_test_super() before it finishes | ||
1500 | * initialization; otherwise, competing mounts with the same | ||
1501 | * options may try to bind the same subsystems instead of waiting | ||
1502 | * for the first one leading to unexpected mount errors. | ||
1503 | * SUBSYS_BOUND will be set once actual binding is complete. | ||
1504 | */ | ||
1491 | root->subsys_mask = opts->subsys_mask; | 1505 | root->subsys_mask = opts->subsys_mask; |
1492 | root->flags = opts->flags; | 1506 | root->flags = opts->flags; |
1493 | ida_init(&root->cgroup_ida); | 1507 | ida_init(&root->cgroup_ida); |
@@ -1500,17 +1514,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
1500 | return root; | 1514 | return root; |
1501 | } | 1515 | } |
1502 | 1516 | ||
1503 | static void cgroup_drop_root(struct cgroupfs_root *root) | 1517 | static void cgroup_free_root(struct cgroupfs_root *root) |
1504 | { | 1518 | { |
1505 | if (!root) | 1519 | if (root) { |
1506 | return; | 1520 | /* hierarchy ID should already have been released */ |

1521 | WARN_ON_ONCE(root->hierarchy_id); | ||
1507 | 1522 | ||
1508 | BUG_ON(!root->hierarchy_id); | 1523 | ida_destroy(&root->cgroup_ida); |
1509 | spin_lock(&hierarchy_id_lock); | 1524 | kfree(root); |
1510 | ida_remove(&hierarchy_ida, root->hierarchy_id); | 1525 | } |
1511 | spin_unlock(&hierarchy_id_lock); | ||
1512 | ida_destroy(&root->cgroup_ida); | ||
1513 | kfree(root); | ||
1514 | } | 1526 | } |
1515 | 1527 | ||
1516 | static int cgroup_set_super(struct super_block *sb, void *data) | 1528 | static int cgroup_set_super(struct super_block *sb, void *data) |
@@ -1597,7 +1609,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1597 | sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); | 1609 | sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); |
1598 | if (IS_ERR(sb)) { | 1610 | if (IS_ERR(sb)) { |
1599 | ret = PTR_ERR(sb); | 1611 | ret = PTR_ERR(sb); |
1600 | cgroup_drop_root(opts.new_root); | 1612 | cgroup_free_root(opts.new_root); |
1601 | goto drop_modules; | 1613 | goto drop_modules; |
1602 | } | 1614 | } |
1603 | 1615 | ||
@@ -1605,12 +1617,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1605 | BUG_ON(!root); | 1617 | BUG_ON(!root); |
1606 | if (root == opts.new_root) { | 1618 | if (root == opts.new_root) { |
1607 | /* We used the new root structure, so this is a new hierarchy */ | 1619 | /* We used the new root structure, so this is a new hierarchy */ |
1608 | struct list_head tmp_cg_links; | 1620 | struct list_head tmp_links; |
1609 | struct cgroup *root_cgrp = &root->top_cgroup; | 1621 | struct cgroup *root_cgrp = &root->top_cgroup; |
1610 | struct cgroupfs_root *existing_root; | 1622 | struct cgroupfs_root *existing_root; |
1611 | const struct cred *cred; | 1623 | const struct cred *cred; |
1612 | int i; | 1624 | int i; |
1613 | struct css_set *cg; | 1625 | struct css_set *cset; |
1614 | 1626 | ||
1615 | BUG_ON(sb->s_root != NULL); | 1627 | BUG_ON(sb->s_root != NULL); |
1616 | 1628 | ||
@@ -1637,13 +1649,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1637 | * that's us. The worst that can happen is that we | 1649 | * that's us. The worst that can happen is that we |
1638 | * have some link structures left over | 1650 | * have some link structures left over |
1639 | */ | 1651 | */ |
1640 | ret = allocate_cg_links(css_set_count, &tmp_cg_links); | 1652 | ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); |
1641 | if (ret) | 1653 | if (ret) |
1642 | goto unlock_drop; | 1654 | goto unlock_drop; |
1643 | 1655 | ||
1644 | ret = rebind_subsystems(root, root->subsys_mask); | 1656 | /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ |
1657 | ret = cgroup_init_root_id(root, 2, 0); | ||
1658 | if (ret) | ||
1659 | goto unlock_drop; | ||
1660 | |||
1661 | ret = rebind_subsystems(root, root->subsys_mask, 0); | ||
1645 | if (ret == -EBUSY) { | 1662 | if (ret == -EBUSY) { |
1646 | free_cg_links(&tmp_cg_links); | 1663 | free_cgrp_cset_links(&tmp_links); |
1647 | goto unlock_drop; | 1664 | goto unlock_drop; |
1648 | } | 1665 | } |
1649 | /* | 1666 | /* |
@@ -1655,8 +1672,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1655 | /* EBUSY should be the only error here */ | 1672 | /* EBUSY should be the only error here */ |
1656 | BUG_ON(ret); | 1673 | BUG_ON(ret); |
1657 | 1674 | ||
1658 | list_add(&root->root_list, &roots); | 1675 | list_add(&root->root_list, &cgroup_roots); |
1659 | root_count++; | 1676 | cgroup_root_count++; |
1660 | 1677 | ||
1661 | sb->s_root->d_fsdata = root_cgrp; | 1678 | sb->s_root->d_fsdata = root_cgrp; |
1662 | root->top_cgroup.dentry = sb->s_root; | 1679 | root->top_cgroup.dentry = sb->s_root; |
@@ -1664,11 +1681,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1664 | /* Link the top cgroup in this hierarchy into all | 1681 | /* Link the top cgroup in this hierarchy into all |
1665 | * the css_set objects */ | 1682 | * the css_set objects */ |
1666 | write_lock(&css_set_lock); | 1683 | write_lock(&css_set_lock); |
1667 | hash_for_each(css_set_table, i, cg, hlist) | 1684 | hash_for_each(css_set_table, i, cset, hlist) |
1668 | link_css_set(&tmp_cg_links, cg, root_cgrp); | 1685 | link_css_set(&tmp_links, cset, root_cgrp); |
1669 | write_unlock(&css_set_lock); | 1686 | write_unlock(&css_set_lock); |
1670 | 1687 | ||
1671 | free_cg_links(&tmp_cg_links); | 1688 | free_cgrp_cset_links(&tmp_links); |
1672 | 1689 | ||
1673 | BUG_ON(!list_empty(&root_cgrp->children)); | 1690 | BUG_ON(!list_empty(&root_cgrp->children)); |
1674 | BUG_ON(root->number_of_cgroups != 1); | 1691 | BUG_ON(root->number_of_cgroups != 1); |
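The tmp_links handling follows the usual preallocate-then-consume pattern hinted at by the "link structures left over" comment: allocate_cgrp_cset_links() builds every cgrp_cset_link while allocating (and sleeping) is still allowed, and link_css_set() then only pops ready-made nodes while css_set_lock is write-held, so there is no failure path inside the lock. A rough userspace sketch of the same shape, with invented names and the lock reduced to a comment:

#include <stdio.h>
#include <stdlib.h>

struct link {
    struct link *next;
    int target;
};

/* Phase 1: allocate everything up front, while failure is still easy to handle. */
static struct link *prealloc_links(int count)
{
    struct link *head = NULL;

    while (count--) {
        struct link *l = malloc(sizeof(*l));

        if (!l)
            break;                  /* caller copes with a short list */
        l->next = head;
        head = l;
    }
    return head;
}

int main(void)
{
    /* Over-allocating is fine; leftovers are simply freed afterwards. */
    struct link *pool = prealloc_links(8);
    struct link *used = NULL;

    /* Phase 2: inside the lock (a spinning rwlock in the kernel), only pop
     * ready-made nodes; no allocation, so no sleeping and no error path. */
    for (int i = 0; i < 5 && pool; i++) {
        struct link *l = pool;

        pool = l->next;
        l->target = i;              /* "link" css_set i to the new root */
        l->next = used;
        used = l;
    }

    /* Free the surplus and, eventually, the consumed links too. */
    while (pool) { struct link *l = pool; pool = l->next; free(l); }
    while (used) { struct link *l = used; used = l->next; free(l); }
    printf("done\n");
    return 0;
}
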
@@ -1684,9 +1701,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1684 | * We re-used an existing hierarchy - the new root (if | 1701 | * We re-used an existing hierarchy - the new root (if |
1685 | * any) is not needed | 1702 | * any) is not needed |
1686 | */ | 1703 | */ |
1687 | cgroup_drop_root(opts.new_root); | 1704 | cgroup_free_root(opts.new_root); |
1688 | 1705 | ||
1689 | if (root->flags != opts.flags) { | 1706 | if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { |
1690 | if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { | 1707 | if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { |
1691 | pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); | 1708 | pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); |
1692 | ret = -EINVAL; | 1709 | ret = -EINVAL; |
@@ -1705,6 +1722,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1705 | return dget(sb->s_root); | 1722 | return dget(sb->s_root); |
1706 | 1723 | ||
1707 | unlock_drop: | 1724 | unlock_drop: |
1725 | cgroup_exit_root_id(root); | ||
1708 | mutex_unlock(&cgroup_root_mutex); | 1726 | mutex_unlock(&cgroup_root_mutex); |
1709 | mutex_unlock(&cgroup_mutex); | 1727 | mutex_unlock(&cgroup_mutex); |
1710 | mutex_unlock(&inode->i_mutex); | 1728 | mutex_unlock(&inode->i_mutex); |
@@ -1721,9 +1739,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1721 | static void cgroup_kill_sb(struct super_block *sb) { | 1739 | static void cgroup_kill_sb(struct super_block *sb) { |
1722 | struct cgroupfs_root *root = sb->s_fs_info; | 1740 | struct cgroupfs_root *root = sb->s_fs_info; |
1723 | struct cgroup *cgrp = &root->top_cgroup; | 1741 | struct cgroup *cgrp = &root->top_cgroup; |
1742 | struct cgrp_cset_link *link, *tmp_link; | ||
1724 | int ret; | 1743 | int ret; |
1725 | struct cg_cgroup_link *link; | ||
1726 | struct cg_cgroup_link *saved_link; | ||
1727 | 1744 | ||
1728 | BUG_ON(!root); | 1745 | BUG_ON(!root); |
1729 | 1746 | ||
@@ -1734,36 +1751,39 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1734 | mutex_lock(&cgroup_root_mutex); | 1751 | mutex_lock(&cgroup_root_mutex); |
1735 | 1752 | ||
1736 | /* Rebind all subsystems back to the default hierarchy */ | 1753 | /* Rebind all subsystems back to the default hierarchy */ |
1737 | ret = rebind_subsystems(root, 0); | 1754 | if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { |
1738 | /* Shouldn't be able to fail ... */ | 1755 | ret = rebind_subsystems(root, 0, root->subsys_mask); |
1739 | BUG_ON(ret); | 1756 | /* Shouldn't be able to fail ... */ |
1757 | BUG_ON(ret); | ||
1758 | } | ||
1740 | 1759 | ||
1741 | /* | 1760 | /* |
1742 | * Release all the links from css_sets to this hierarchy's | 1761 | * Release all the links from cset_links to this hierarchy's |
1743 | * root cgroup | 1762 | * root cgroup |
1744 | */ | 1763 | */ |
1745 | write_lock(&css_set_lock); | 1764 | write_lock(&css_set_lock); |
1746 | 1765 | ||
1747 | list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, | 1766 | list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { |
1748 | cgrp_link_list) { | 1767 | list_del(&link->cset_link); |
1749 | list_del(&link->cg_link_list); | 1768 | list_del(&link->cgrp_link); |
1750 | list_del(&link->cgrp_link_list); | ||
1751 | kfree(link); | 1769 | kfree(link); |
1752 | } | 1770 | } |
1753 | write_unlock(&css_set_lock); | 1771 | write_unlock(&css_set_lock); |
1754 | 1772 | ||
1755 | if (!list_empty(&root->root_list)) { | 1773 | if (!list_empty(&root->root_list)) { |
1756 | list_del(&root->root_list); | 1774 | list_del(&root->root_list); |
1757 | root_count--; | 1775 | cgroup_root_count--; |
1758 | } | 1776 | } |
1759 | 1777 | ||
1778 | cgroup_exit_root_id(root); | ||
1779 | |||
1760 | mutex_unlock(&cgroup_root_mutex); | 1780 | mutex_unlock(&cgroup_root_mutex); |
1761 | mutex_unlock(&cgroup_mutex); | 1781 | mutex_unlock(&cgroup_mutex); |
1762 | 1782 | ||
1763 | simple_xattrs_free(&cgrp->xattrs); | 1783 | simple_xattrs_free(&cgrp->xattrs); |
1764 | 1784 | ||
1765 | kill_litter_super(sb); | 1785 | kill_litter_super(sb); |
1766 | cgroup_drop_root(root); | 1786 | cgroup_free_root(root); |
1767 | } | 1787 | } |
1768 | 1788 | ||
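The link teardown in cgroup_kill_sb() is the standard delete-while-iterating pattern: list_for_each_entry_safe() stashes the next entry before the loop body runs, so the body is free to unlink and kfree() the current one. A small standalone equivalent with an open-coded singly linked list:

#include <stdio.h>
#include <stdlib.h>

struct node {
    struct node *next;
    int id;
};

int main(void)
{
    struct node *head = NULL;

    /* Build a short list: 4 -> 3 -> 2 -> 1 -> 0. */
    for (int i = 0; i < 5; i++) {
        struct node *n = malloc(sizeof(*n));

        if (!n)
            break;
        n->id = i;
        n->next = head;
        head = n;
    }

    /* Delete every node: remember ->next *before* freeing the current
     * node, which is exactly what the _safe iterator variant does. */
    for (struct node *cur = head, *tmp; cur; cur = tmp) {
        tmp = cur->next;            /* saved copy survives the free below */
        printf("dropping link %d\n", cur->id);
        free(cur);
    }
    head = NULL;
    return 0;
}
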
1769 | static struct file_system_type cgroup_fs_type = { | 1789 | static struct file_system_type cgroup_fs_type = { |
@@ -1825,6 +1845,38 @@ out: | |||
1825 | } | 1845 | } |
1826 | EXPORT_SYMBOL_GPL(cgroup_path); | 1846 | EXPORT_SYMBOL_GPL(cgroup_path); |
1827 | 1847 | ||
1848 | /** | ||
1849 | * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy | ||
1850 | * @task: target task | ||
1851 | * @hierarchy_id: the hierarchy to look up @task's cgroup from | ||
1852 | * @buf: the buffer to write the path into | ||
1853 | * @buflen: the length of the buffer | ||
1854 | * | ||
1855 | * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and | ||
1856 | * copy its path into @buf. This function grabs cgroup_mutex and shouldn't | ||
1857 | * be used inside locks used by cgroup controller callbacks. | ||
1858 | */ | ||
1859 | int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id, | ||
1860 | char *buf, size_t buflen) | ||
1861 | { | ||
1862 | struct cgroupfs_root *root; | ||
1863 | struct cgroup *cgrp = NULL; | ||
1864 | int ret = -ENOENT; | ||
1865 | |||
1866 | mutex_lock(&cgroup_mutex); | ||
1867 | |||
1868 | root = idr_find(&cgroup_hierarchy_idr, hierarchy_id); | ||
1869 | if (root) { | ||
1870 | cgrp = task_cgroup_from_root(task, root); | ||
1871 | ret = cgroup_path(cgrp, buf, buflen); | ||
1872 | } | ||
1873 | |||
1874 | mutex_unlock(&cgroup_mutex); | ||
1875 | |||
1876 | return ret; | ||
1877 | } | ||
1878 | EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy); | ||
1879 | |||
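For context, a hedged sketch of how a kernel-side caller might use the new export. The fragment is illustrative only: the buffer size, the hierarchy ID, and the assumption that the prototype lives in <linux/cgroup.h> are all mine, and the 0-on-success convention is inferred from the implementation above.

#include <linux/cgroup.h>
#include <linux/limits.h>
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/slab.h>

/* Illustrative caller, not part of the patch. */
static void report_cgroup_of_current(int hierarchy_id)
{
        char *buf = kmalloc(PATH_MAX, GFP_KERNEL);
        int ret;

        if (!buf)
                return;

        /* The helper takes cgroup_mutex itself, so never call it from a
         * controller callback that already holds cgroup locks. */
        ret = task_cgroup_path_from_hierarchy(current, hierarchy_id,
                                              buf, PATH_MAX);
        if (!ret)
                pr_info("current task's cgroup: %s\n", buf);
        else
                pr_info("no cgroup path for hierarchy %d: %d\n",
                        hierarchy_id, ret);

        kfree(buf);
}
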
1828 | /* | 1880 | /* |
1829 | * Control Group taskset | 1881 | * Control Group taskset |
1830 | */ | 1882 | */ |
@@ -1910,10 +1962,11 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); | |||
1910 | * | 1962 | * |
1911 | * Must be called with cgroup_mutex and threadgroup locked. | 1963 | * Must be called with cgroup_mutex and threadgroup locked. |
1912 | */ | 1964 | */ |
1913 | static void cgroup_task_migrate(struct cgroup *oldcgrp, | 1965 | static void cgroup_task_migrate(struct cgroup *old_cgrp, |
1914 | struct task_struct *tsk, struct css_set *newcg) | 1966 | struct task_struct *tsk, |
1967 | struct css_set *new_cset) | ||
1915 | { | 1968 | { |
1916 | struct css_set *oldcg; | 1969 | struct css_set *old_cset; |
1917 | 1970 | ||
1918 | /* | 1971 | /* |
1919 | * We are synchronized through threadgroup_lock() against PF_EXITING | 1972 | * We are synchronized through threadgroup_lock() against PF_EXITING |
@@ -1921,25 +1974,25 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp, | |||
1921 | * css_set to init_css_set and dropping the old one. | 1974 | * css_set to init_css_set and dropping the old one. |
1922 | */ | 1975 | */ |
1923 | WARN_ON_ONCE(tsk->flags & PF_EXITING); | 1976 | WARN_ON_ONCE(tsk->flags & PF_EXITING); |
1924 | oldcg = tsk->cgroups; | 1977 | old_cset = task_css_set(tsk); |
1925 | 1978 | ||
1926 | task_lock(tsk); | 1979 | task_lock(tsk); |
1927 | rcu_assign_pointer(tsk->cgroups, newcg); | 1980 | rcu_assign_pointer(tsk->cgroups, new_cset); |
1928 | task_unlock(tsk); | 1981 | task_unlock(tsk); |
1929 | 1982 | ||
1930 | /* Update the css_set linked lists if we're using them */ | 1983 | /* Update the css_set linked lists if we're using them */ |
1931 | write_lock(&css_set_lock); | 1984 | write_lock(&css_set_lock); |
1932 | if (!list_empty(&tsk->cg_list)) | 1985 | if (!list_empty(&tsk->cg_list)) |
1933 | list_move(&tsk->cg_list, &newcg->tasks); | 1986 | list_move(&tsk->cg_list, &new_cset->tasks); |
1934 | write_unlock(&css_set_lock); | 1987 | write_unlock(&css_set_lock); |
1935 | 1988 | ||
1936 | /* | 1989 | /* |
1937 | * We just gained a reference on oldcg by taking it from the task. As | 1990 | * We just gained a reference on old_cset by taking it from the |
1938 | * trading it for newcg is protected by cgroup_mutex, we're safe to drop | 1991 | * task. As trading it for new_cset is protected by cgroup_mutex, |
1939 | * it here; it will be freed under RCU. | 1992 | * we're safe to drop it here; it will be freed under RCU. |
1940 | */ | 1993 | */ |
1941 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); | 1994 | set_bit(CGRP_RELEASABLE, &old_cgrp->flags); |
1942 | put_css_set(oldcg); | 1995 | put_css_set(old_cset); |
1943 | } | 1996 | } |
1944 | 1997 | ||
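What matters in cgroup_task_migrate() is the ordering its comments spell out: the task's old css_set pointer itself represents a reference, so the code first publishes the new pointer (and moves the cg_list linkage), and only then drops the reference it took over, guaranteeing the old set stays alive while anything can still reach it through the task. A compressed userspace analogue with a plain refcount; the real code relies on RCU, task_lock() and css_set_lock, all elided here:

#include <stdio.h>
#include <stdlib.h>

struct cset {                       /* stand-in for struct css_set */
    int refcount;                   /* locking/atomics elided */
    const char *name;
};

struct task {                       /* stand-in for task_struct */
    struct cset *cgroups;
};

static void put_cset(struct cset *cset)
{
    if (--cset->refcount == 0) {
        printf("freeing css_set '%s'\n", cset->name);
        free(cset);
    }
}

/* The task's ->cgroups pointer counts as one reference on the old set.
 * Repointing it "takes over" that reference, which may only be dropped
 * once nothing reaches the old set through the task any more. */
static void migrate(struct task *tsk, struct cset *new_cset)
{
    struct cset *old_cset = tsk->cgroups;

    tsk->cgroups = new_cset;        /* rcu_assign_pointer() in the real code */
    /* (the real code also moves tsk->cg_list onto new_cset->tasks here) */
    put_cset(old_cset);
}

int main(void)
{
    struct cset *a = calloc(1, sizeof(*a)), *b = calloc(1, sizeof(*b));
    struct task t;

    if (!a || !b)
        return 1;
    a->refcount = 1; a->name = "old";
    b->refcount = 1; b->name = "new";
    t.cgroups = a;

    migrate(&t, b);                 /* prints: freeing css_set 'old' */
    put_cset(t.cgroups);            /* prints: freeing css_set 'new' */
    return 0;
}
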
1945 | /** | 1998 | /** |
@@ -2029,7 +2082,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2029 | /* | 2082 | /* |
2030 | * step 1: check that we can legitimately attach to the cgroup. | 2083 | * step 1: check that we can legitimately attach to the cgroup. |
2031 | */ | 2084 | */ |
2032 | for_each_subsys(root, ss) { | 2085 | for_each_root_subsys(root, ss) { |
2033 | if (ss->can_attach) { | 2086 | if (ss->can_attach) { |
2034 | retval = ss->can_attach(cgrp, &tset); | 2087 | retval = ss->can_attach(cgrp, &tset); |
2035 | if (retval) { | 2088 | if (retval) { |
@@ -2044,8 +2097,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2044 | * we use find_css_set, which allocates a new one if necessary. | 2097 | * we use find_css_set, which allocates a new one if necessary. |
2045 | */ | 2098 | */ |
2046 | for (i = 0; i < group_size; i++) { | 2099 | for (i = 0; i < group_size; i++) { |
2100 | struct css_set *old_cset; | ||
2101 | |||
2047 | tc = flex_array_get(group, i); | 2102 | tc = flex_array_get(group, i); |
2048 | tc->cg = find_css_set(tc->task->cgroups, cgrp); | 2103 | old_cset = task_css_set(tc->task); |
2104 | tc->cg = find_css_set(old_cset, cgrp); | ||
2049 | if (!tc->cg) { | 2105 | if (!tc->cg) { |
2050 | retval = -ENOMEM; | 2106 | retval = -ENOMEM; |
2051 | goto out_put_css_set_refs; | 2107 | goto out_put_css_set_refs; |
@@ -2066,7 +2122,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2066 | /* | 2122 | /* |
2067 | * step 4: do subsystem attach callbacks. | 2123 | * step 4: do subsystem attach callbacks. |
2068 | */ | 2124 | */ |
2069 | for_each_subsys(root, ss) { | 2125 | for_each_root_subsys(root, ss) { |
2070 | if (ss->attach) | 2126 | if (ss->attach) |
2071 | ss->attach(cgrp, &tset); | 2127 | ss->attach(cgrp, &tset); |
2072 | } | 2128 | } |
@@ -2086,7 +2142,7 @@ out_put_css_set_refs: | |||
2086 | } | 2142 | } |
2087 | out_cancel_attach: | 2143 | out_cancel_attach: |
2088 | if (retval) { | 2144 | if (retval) { |
2089 | for_each_subsys(root, ss) { | 2145 | for_each_root_subsys(root, ss) { |
2090 | if (ss == failed_ss) | 2146 | if (ss == failed_ss) |
2091 | break; | 2147 | break; |
2092 | if (ss->cancel_attach) | 2148 | if (ss->cancel_attach) |
@@ -2323,7 +2379,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, | |||
2323 | struct cftype *cft = __d_cft(file->f_dentry); | 2379 | struct cftype *cft = __d_cft(file->f_dentry); |
2324 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2380 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
2325 | 2381 | ||
2326 | if (cgroup_is_removed(cgrp)) | 2382 | if (cgroup_is_dead(cgrp)) |
2327 | return -ENODEV; | 2383 | return -ENODEV; |
2328 | if (cft->write) | 2384 | if (cft->write) |
2329 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); | 2385 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); |
@@ -2368,7 +2424,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, | |||
2368 | struct cftype *cft = __d_cft(file->f_dentry); | 2424 | struct cftype *cft = __d_cft(file->f_dentry); |
2369 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2425 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
2370 | 2426 | ||
2371 | if (cgroup_is_removed(cgrp)) | 2427 | if (cgroup_is_dead(cgrp)) |
2372 | return -ENODEV; | 2428 | return -ENODEV; |
2373 | 2429 | ||
2374 | if (cft->read) | 2430 | if (cft->read) |
@@ -2435,10 +2491,12 @@ static int cgroup_file_open(struct inode *inode, struct file *file) | |||
2435 | cft = __d_cft(file->f_dentry); | 2491 | cft = __d_cft(file->f_dentry); |
2436 | 2492 | ||
2437 | if (cft->read_map || cft->read_seq_string) { | 2493 | if (cft->read_map || cft->read_seq_string) { |
2438 | struct cgroup_seqfile_state *state = | 2494 | struct cgroup_seqfile_state *state; |
2439 | kzalloc(sizeof(*state), GFP_USER); | 2495 | |
2496 | state = kzalloc(sizeof(*state), GFP_USER); | ||
2440 | if (!state) | 2497 | if (!state) |
2441 | return -ENOMEM; | 2498 | return -ENOMEM; |
2499 | |||
2442 | state->cft = cft; | 2500 | state->cft = cft; |
2443 | state->cgroup = __d_cgrp(file->f_dentry->d_parent); | 2501 | state->cgroup = __d_cgrp(file->f_dentry->d_parent); |
2444 | file->f_op = &cgroup_seqfile_operations; | 2502 | file->f_op = &cgroup_seqfile_operations; |
@@ -2486,6 +2544,13 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
2486 | 2544 | ||
2487 | cgrp = __d_cgrp(old_dentry); | 2545 | cgrp = __d_cgrp(old_dentry); |
2488 | 2546 | ||
2547 | /* | ||
2548 | * This isn't a proper migration and its usefulness is very | ||
2549 | * limited. Disallow if sane_behavior. | ||
2550 | */ | ||
2551 | if (cgroup_sane_behavior(cgrp)) | ||
2552 | return -EPERM; | ||
2553 | |||
2489 | name = cgroup_alloc_name(new_dentry); | 2554 | name = cgroup_alloc_name(new_dentry); |
2490 | if (!name) | 2555 | if (!name) |
2491 | return -ENOMEM; | 2556 | return -ENOMEM; |
@@ -2496,7 +2561,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
2496 | return ret; | 2561 | return ret; |
2497 | } | 2562 | } |
2498 | 2563 | ||
2499 | old_name = cgrp->name; | 2564 | old_name = rcu_dereference_protected(cgrp->name, true); |
2500 | rcu_assign_pointer(cgrp->name, name); | 2565 | rcu_assign_pointer(cgrp->name, name); |
2501 | 2566 | ||
2502 | kfree_rcu(old_name, rcu_head); | 2567 | kfree_rcu(old_name, rcu_head); |
@@ -2747,58 +2812,78 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2747 | return ret; | 2812 | return ret; |
2748 | } | 2813 | } |
2749 | 2814 | ||
2750 | static DEFINE_MUTEX(cgroup_cft_mutex); | ||
2751 | |||
2752 | static void cgroup_cfts_prepare(void) | 2815 | static void cgroup_cfts_prepare(void) |
2753 | __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) | 2816 | __acquires(&cgroup_mutex) |
2754 | { | 2817 | { |
2755 | /* | 2818 | /* |
2756 | * Thanks to the entanglement with vfs inode locking, we can't walk | 2819 | * Thanks to the entanglement with vfs inode locking, we can't walk |
2757 | * the existing cgroups under cgroup_mutex and create files. | 2820 | * the existing cgroups under cgroup_mutex and create files. |
2758 | * Instead, we increment reference on all cgroups and build list of | 2821 | * Instead, we use cgroup_for_each_descendant_pre() and drop RCU |
2759 | * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure | 2822 | * read lock before calling cgroup_addrm_files(). |
2760 | * exclusive access to the field. | ||
2761 | */ | 2823 | */ |
2762 | mutex_lock(&cgroup_cft_mutex); | ||
2763 | mutex_lock(&cgroup_mutex); | 2824 | mutex_lock(&cgroup_mutex); |
2764 | } | 2825 | } |
2765 | 2826 | ||
2766 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, | 2827 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, |
2767 | struct cftype *cfts, bool is_add) | 2828 | struct cftype *cfts, bool is_add) |
2768 | __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) | 2829 | __releases(&cgroup_mutex) |
2769 | { | 2830 | { |
2770 | LIST_HEAD(pending); | 2831 | LIST_HEAD(pending); |
2771 | struct cgroup *cgrp, *n; | 2832 | struct cgroup *cgrp, *root = &ss->root->top_cgroup; |
2833 | struct super_block *sb = ss->root->sb; | ||
2834 | struct dentry *prev = NULL; | ||
2835 | struct inode *inode; | ||
2836 | u64 update_before; | ||
2772 | 2837 | ||
2773 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ | 2838 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ |
2774 | if (cfts && ss->root != &rootnode) { | 2839 | if (!cfts || ss->root == &cgroup_dummy_root || |
2775 | list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { | 2840 | !atomic_inc_not_zero(&sb->s_active)) { |
2776 | dget(cgrp->dentry); | 2841 | mutex_unlock(&cgroup_mutex); |
2777 | list_add_tail(&cgrp->cft_q_node, &pending); | 2842 | return; |
2778 | } | ||
2779 | } | 2843 | } |
2780 | 2844 | ||
2781 | mutex_unlock(&cgroup_mutex); | ||
2782 | |||
2783 | /* | 2845 | /* |
2784 | * All new cgroups will see @cfts update on @ss->cftsets. Add/rm | 2846 | * All cgroups which are created after we drop cgroup_mutex will |
2785 | * files for all cgroups which were created before. | 2847 | * have the updated set of files, so we only need to update the |
2848 | * cgroups created before the current @cgroup_serial_nr_next. | ||
2786 | */ | 2849 | */ |
2787 | list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { | 2850 | update_before = cgroup_serial_nr_next; |
2788 | struct inode *inode = cgrp->dentry->d_inode; | 2851 | |
2852 | mutex_unlock(&cgroup_mutex); | ||
2853 | |||
2854 | /* @root always needs to be updated */ | ||
2855 | inode = root->dentry->d_inode; | ||
2856 | mutex_lock(&inode->i_mutex); | ||
2857 | mutex_lock(&cgroup_mutex); | ||
2858 | cgroup_addrm_files(root, ss, cfts, is_add); | ||
2859 | mutex_unlock(&cgroup_mutex); | ||
2860 | mutex_unlock(&inode->i_mutex); | ||
2861 | |||
2862 | /* add/rm files for all cgroups created before */ | ||
2863 | rcu_read_lock(); | ||
2864 | cgroup_for_each_descendant_pre(cgrp, root) { | ||
2865 | if (cgroup_is_dead(cgrp)) | ||
2866 | continue; | ||
2867 | |||
2868 | inode = cgrp->dentry->d_inode; | ||
2869 | dget(cgrp->dentry); | ||
2870 | rcu_read_unlock(); | ||
2871 | |||
2872 | dput(prev); | ||
2873 | prev = cgrp->dentry; | ||
2789 | 2874 | ||
2790 | mutex_lock(&inode->i_mutex); | 2875 | mutex_lock(&inode->i_mutex); |
2791 | mutex_lock(&cgroup_mutex); | 2876 | mutex_lock(&cgroup_mutex); |
2792 | if (!cgroup_is_removed(cgrp)) | 2877 | if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) |
2793 | cgroup_addrm_files(cgrp, ss, cfts, is_add); | 2878 | cgroup_addrm_files(cgrp, ss, cfts, is_add); |
2794 | mutex_unlock(&cgroup_mutex); | 2879 | mutex_unlock(&cgroup_mutex); |
2795 | mutex_unlock(&inode->i_mutex); | 2880 | mutex_unlock(&inode->i_mutex); |
2796 | 2881 | ||
2797 | list_del_init(&cgrp->cft_q_node); | 2882 | rcu_read_lock(); |
2798 | dput(cgrp->dentry); | ||
2799 | } | 2883 | } |
2800 | 2884 | rcu_read_unlock(); | |
2801 | mutex_unlock(&cgroup_cft_mutex); | 2885 | dput(prev); |
2886 | deactivate_super(sb); | ||
2802 | } | 2887 | } |
2803 | 2888 | ||
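The rewrite leans on the new per-cgroup serial numbers: cgroup_serial_nr_next is sampled once as update_before while cgroup_mutex is still held, and after the lock is dropped only cgroups whose serial number predates that snapshot need their files added or removed by hand, because anything created later already sees the updated cftype set at creation time. A toy standalone version of that cutoff:

#include <stdio.h>

#define NR 6

static unsigned long long serial_nr_next = 1;   /* monotonically increasing */

struct cgrp {
    unsigned long long serial_nr;
    int has_new_file;
};

static void create(struct cgrp *c, int sees_new_cftypes)
{
    c->serial_nr = serial_nr_next++;
    c->has_new_file = sees_new_cftypes;     /* new cgroups copy current cftypes */
}

int main(void)
{
    struct cgrp grp[NR];
    int i;

    for (i = 0; i < 3; i++)
        create(&grp[i], 0);                 /* created before the cftype change */

    /* cgroup_cfts_commit(): snapshot the counter, then drop the lock. */
    unsigned long long update_before = serial_nr_next;

    for (i = 3; i < NR; i++)
        create(&grp[i], 1);                 /* racing creations after the snapshot */

    /* Only pre-snapshot cgroups need the new file added by hand. */
    for (i = 0; i < NR; i++)
        if (grp[i].serial_nr < update_before && !grp[i].has_new_file)
            grp[i].has_new_file = 1;

    for (i = 0; i < NR; i++)
        printf("cgroup %d: serial %llu, has new file: %d\n",
               i, grp[i].serial_nr, grp[i].has_new_file);
    return 0;
}
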
2804 | /** | 2889 | /** |
@@ -2853,7 +2938,8 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
2853 | 2938 | ||
2854 | list_for_each_entry(set, &ss->cftsets, node) { | 2939 | list_for_each_entry(set, &ss->cftsets, node) { |
2855 | if (set->cfts == cfts) { | 2940 | if (set->cfts == cfts) { |
2856 | list_del_init(&set->node); | 2941 | list_del(&set->node); |
2942 | kfree(set); | ||
2857 | cgroup_cfts_commit(ss, cfts, false); | 2943 | cgroup_cfts_commit(ss, cfts, false); |
2858 | return 0; | 2944 | return 0; |
2859 | } | 2945 | } |
@@ -2872,12 +2958,11 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
2872 | int cgroup_task_count(const struct cgroup *cgrp) | 2958 | int cgroup_task_count(const struct cgroup *cgrp) |
2873 | { | 2959 | { |
2874 | int count = 0; | 2960 | int count = 0; |
2875 | struct cg_cgroup_link *link; | 2961 | struct cgrp_cset_link *link; |
2876 | 2962 | ||
2877 | read_lock(&css_set_lock); | 2963 | read_lock(&css_set_lock); |
2878 | list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { | 2964 | list_for_each_entry(link, &cgrp->cset_links, cset_link) |
2879 | count += atomic_read(&link->cg->refcount); | 2965 | count += atomic_read(&link->cset->refcount); |
2880 | } | ||
2881 | read_unlock(&css_set_lock); | 2966 | read_unlock(&css_set_lock); |
2882 | return count; | 2967 | return count; |
2883 | } | 2968 | } |
@@ -2886,25 +2971,24 @@ int cgroup_task_count(const struct cgroup *cgrp) | |||
2886 | * Advance a list_head iterator. The iterator should be positioned at | 2971 | * Advance a list_head iterator. The iterator should be positioned at |
2887 | * the start of a css_set | 2972 | * the start of a css_set |
2888 | */ | 2973 | */ |
2889 | static void cgroup_advance_iter(struct cgroup *cgrp, | 2974 | static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) |
2890 | struct cgroup_iter *it) | ||
2891 | { | 2975 | { |
2892 | struct list_head *l = it->cg_link; | 2976 | struct list_head *l = it->cset_link; |
2893 | struct cg_cgroup_link *link; | 2977 | struct cgrp_cset_link *link; |
2894 | struct css_set *cg; | 2978 | struct css_set *cset; |
2895 | 2979 | ||
2896 | /* Advance to the next non-empty css_set */ | 2980 | /* Advance to the next non-empty css_set */ |
2897 | do { | 2981 | do { |
2898 | l = l->next; | 2982 | l = l->next; |
2899 | if (l == &cgrp->css_sets) { | 2983 | if (l == &cgrp->cset_links) { |
2900 | it->cg_link = NULL; | 2984 | it->cset_link = NULL; |
2901 | return; | 2985 | return; |
2902 | } | 2986 | } |
2903 | link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); | 2987 | link = list_entry(l, struct cgrp_cset_link, cset_link); |
2904 | cg = link->cg; | 2988 | cset = link->cset; |
2905 | } while (list_empty(&cg->tasks)); | 2989 | } while (list_empty(&cset->tasks)); |
2906 | it->cg_link = l; | 2990 | it->cset_link = l; |
2907 | it->task = cg->tasks.next; | 2991 | it->task = cset->tasks.next; |
2908 | } | 2992 | } |
2909 | 2993 | ||
2910 | /* | 2994 | /* |
@@ -2934,7 +3018,7 @@ static void cgroup_enable_task_cg_lists(void) | |||
2934 | * entry won't be deleted though the process has exited. | 3018 | * entry won't be deleted though the process has exited. |
2935 | */ | 3019 | */ |
2936 | if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) | 3020 | if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) |
2937 | list_add(&p->cg_list, &p->cgroups->tasks); | 3021 | list_add(&p->cg_list, &task_css_set(p)->tasks); |
2938 | task_unlock(p); | 3022 | task_unlock(p); |
2939 | } while_each_thread(g, p); | 3023 | } while_each_thread(g, p); |
2940 | read_unlock(&tasklist_lock); | 3024 | read_unlock(&tasklist_lock); |
@@ -2942,12 +3026,67 @@ static void cgroup_enable_task_cg_lists(void) | |||
2942 | } | 3026 | } |
2943 | 3027 | ||
2944 | /** | 3028 | /** |
3029 | * cgroup_next_sibling - find the next sibling of a given cgroup | ||
3030 | * @pos: the current cgroup | ||
3031 | * | ||
3032 | * This function returns the next sibling of @pos and should be called | ||
3033 | * under RCU read lock. The only requirement is that @pos is accessible. | ||
3034 | * The next sibling is guaranteed to be returned regardless of @pos's | ||
3035 | * state. | ||
3036 | */ | ||
3037 | struct cgroup *cgroup_next_sibling(struct cgroup *pos) | ||
3038 | { | ||
3039 | struct cgroup *next; | ||
3040 | |||
3041 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
3042 | |||
3043 | /* | ||
3044 | * @pos could already have been removed. Once a cgroup is removed, | ||
3045 | * its ->sibling.next is no longer updated when its next sibling | ||
3046 | * changes. As CGRP_DEAD assertion is serialized and happens | ||
3047 | * before the cgroup is taken off the ->sibling list, if we see it | ||
3048 | * unasserted, it's guaranteed that the next sibling hasn't | ||
3049 | * finished its grace period even if it's already removed, and thus | ||
3050 | * safe to dereference from this RCU critical section. If | ||
3051 | * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed | ||
3052 | * to be visible as %true here. | ||
3053 | */ | ||
3054 | if (likely(!cgroup_is_dead(pos))) { | ||
3055 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); | ||
3056 | if (&next->sibling != &pos->parent->children) | ||
3057 | return next; | ||
3058 | return NULL; | ||
3059 | } | ||
3060 | |||
3061 | /* | ||
3062 | * Can't dereference the next pointer. Each cgroup is given a | ||
3063 | * monotonically increasing unique serial number and always | ||
3064 | * appended to the sibling list, so the next one can be found by | ||
3065 | * walking the parent's children until we see a cgroup with higher | ||
3066 | * serial number than @pos's. | ||
3067 | * | ||
3068 | * While this path can be slow, it's taken only when either the | ||
3069 | * current cgroup is removed or iteration and removal race. | ||
3070 | */ | ||
3071 | list_for_each_entry_rcu(next, &pos->parent->children, sibling) | ||
3072 | if (next->serial_nr > pos->serial_nr) | ||
3073 | return next; | ||
3074 | return NULL; | ||
3075 | } | ||
3076 | EXPORT_SYMBOL_GPL(cgroup_next_sibling); | ||
3077 | |||
3078 | /** | ||
2945 | * cgroup_next_descendant_pre - find the next descendant for pre-order walk | 3079 | * cgroup_next_descendant_pre - find the next descendant for pre-order walk |
2946 | * @pos: the current position (%NULL to initiate traversal) | 3080 | * @pos: the current position (%NULL to initiate traversal) |
2947 | * @cgroup: cgroup whose descendants to walk | 3081 | * @cgroup: cgroup whose descendants to walk |
2948 | * | 3082 | * |
2949 | * To be used by cgroup_for_each_descendant_pre(). Find the next | 3083 | * To be used by cgroup_for_each_descendant_pre(). Find the next |
2950 | * descendant to visit for pre-order traversal of @cgroup's descendants. | 3084 | * descendant to visit for pre-order traversal of @cgroup's descendants. |
3085 | * | ||
3086 | * While this function requires RCU read locking, it doesn't require the | ||
3087 | * whole traversal to be contained in a single RCU critical section. This | ||
3088 | * function will return the correct next descendant as long as both @pos | ||
3089 | * and @cgroup are accessible and @pos is a descendant of @cgroup. | ||
2951 | */ | 3090 | */ |
2952 | struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | 3091 | struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, |
2953 | struct cgroup *cgroup) | 3092 | struct cgroup *cgroup) |
@@ -2967,11 +3106,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | |||
2967 | 3106 | ||
2968 | /* no child, visit my or the closest ancestor's next sibling */ | 3107 | /* no child, visit my or the closest ancestor's next sibling */ |
2969 | while (pos != cgroup) { | 3108 | while (pos != cgroup) { |
2970 | next = list_entry_rcu(pos->sibling.next, struct cgroup, | 3109 | next = cgroup_next_sibling(pos); |
2971 | sibling); | 3110 | if (next) |
2972 | if (&next->sibling != &pos->parent->children) | ||
2973 | return next; | 3111 | return next; |
2974 | |||
2975 | pos = pos->parent; | 3112 | pos = pos->parent; |
2976 | } | 3113 | } |
2977 | 3114 | ||
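With a removal-safe sibling step like cgroup_next_sibling(), the pre-order walk needs nothing but the current position and the root: try a child first, otherwise climb towards the root looking for an unvisited sibling. A compact userspace rendering of the same loop over a pointer-linked tree (locking and RCU elided; only the shape of the walk is shown):

#include <stdio.h>
#include <stddef.h>

struct node {
    const char *name;
    struct node *parent;
    struct node *first_child;
    struct node *next_sibling;
};

/* Pre-order successor of @pos within the subtree rooted at @root,
 * mirroring cgroup_next_descendant_pre(); @root itself is not visited. */
static struct node *next_descendant_pre(struct node *pos, struct node *root)
{
    /* On the first call, pretend we just visited @root. */
    if (!pos)
        pos = root;

    /* visit a child first, if there is one */
    if (pos->first_child)
        return pos->first_child;

    /* otherwise the next sibling of @pos or of the closest ancestor */
    while (pos != root) {
        if (pos->next_sibling)
            return pos->next_sibling;
        pos = pos->parent;
    }
    return NULL;                    /* walked the whole subtree */
}

int main(void)
{
    /* root -> {a -> {c, d}, b}; expected visit order: a c d b */
    struct node root = { .name = "root" };
    struct node a = { .name = "a", .parent = &root };
    struct node b = { .name = "b", .parent = &root };
    struct node c = { .name = "c", .parent = &a };
    struct node d = { .name = "d", .parent = &a };

    root.first_child = &a;  a.next_sibling = &b;
    a.first_child    = &c;  c.next_sibling = &d;

    for (struct node *pos = NULL; (pos = next_descendant_pre(pos, &root)); )
        printf("%s ", pos->name);
    printf("\n");
    return 0;
}
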
@@ -2986,6 +3123,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | |||
2986 | * Return the rightmost descendant of @pos. If there's no descendant, | 3123 | * Return the rightmost descendant of @pos. If there's no descendant, |
2987 | * @pos is returned. This can be used during pre-order traversal to skip | 3124 | * @pos is returned. This can be used during pre-order traversal to skip |
2988 | * subtree of @pos. | 3125 | * subtree of @pos. |
3126 | * | ||
3127 | * While this function requires RCU read locking, it doesn't require the | ||
3128 | * whole traversal to be contained in a single RCU critical section. This | ||
3129 | * function will return the correct rightmost descendant as long as @pos is | ||
3130 | * accessible. | ||
2989 | */ | 3131 | */ |
2990 | struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) | 3132 | struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) |
2991 | { | 3133 | { |
@@ -3025,6 +3167,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) | |||
3025 | * | 3167 | * |
3026 | * To be used by cgroup_for_each_descendant_post(). Find the next | 3168 | * To be used by cgroup_for_each_descendant_post(). Find the next |
3027 | * descendant to visit for post-order traversal of @cgroup's descendants. | 3169 | * descendant to visit for post-order traversal of @cgroup's descendants. |
3170 | * | ||
3171 | * While this function requires RCU read locking, it doesn't require the | ||
3172 | * whole traversal to be contained in a single RCU critical section. This | ||
3173 | * function will return the correct next descendant as long as both @pos | ||
3174 | * and @cgroup are accessible and @pos is a descendant of @cgroup. | ||
3028 | */ | 3175 | */ |
3029 | struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, | 3176 | struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, |
3030 | struct cgroup *cgroup) | 3177 | struct cgroup *cgroup) |
@@ -3040,8 +3187,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, | |||
3040 | } | 3187 | } |
3041 | 3188 | ||
3042 | /* if there's an unvisited sibling, visit its leftmost descendant */ | 3189 | /* if there's an unvisited sibling, visit its leftmost descendant */ |
3043 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); | 3190 | next = cgroup_next_sibling(pos); |
3044 | if (&next->sibling != &pos->parent->children) | 3191 | if (next) |
3045 | return cgroup_leftmost_descendant(next); | 3192 | return cgroup_leftmost_descendant(next); |
3046 | 3193 | ||
3047 | /* no sibling left, visit parent */ | 3194 | /* no sibling left, visit parent */ |
@@ -3062,7 +3209,7 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | |||
3062 | cgroup_enable_task_cg_lists(); | 3209 | cgroup_enable_task_cg_lists(); |
3063 | 3210 | ||
3064 | read_lock(&css_set_lock); | 3211 | read_lock(&css_set_lock); |
3065 | it->cg_link = &cgrp->css_sets; | 3212 | it->cset_link = &cgrp->cset_links; |
3066 | cgroup_advance_iter(cgrp, it); | 3213 | cgroup_advance_iter(cgrp, it); |
3067 | } | 3214 | } |
3068 | 3215 | ||
@@ -3071,16 +3218,16 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | |||
3071 | { | 3218 | { |
3072 | struct task_struct *res; | 3219 | struct task_struct *res; |
3073 | struct list_head *l = it->task; | 3220 | struct list_head *l = it->task; |
3074 | struct cg_cgroup_link *link; | 3221 | struct cgrp_cset_link *link; |
3075 | 3222 | ||
3076 | /* If the iterator cg is NULL, we have no tasks */ | 3223 | /* If the iterator cg is NULL, we have no tasks */ |
3077 | if (!it->cg_link) | 3224 | if (!it->cset_link) |
3078 | return NULL; | 3225 | return NULL; |
3079 | res = list_entry(l, struct task_struct, cg_list); | 3226 | res = list_entry(l, struct task_struct, cg_list); |
3080 | /* Advance iterator to find next entry */ | 3227 | /* Advance iterator to find next entry */ |
3081 | l = l->next; | 3228 | l = l->next; |
3082 | link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); | 3229 | link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); |
3083 | if (l == &link->cg->tasks) { | 3230 | if (l == &link->cset->tasks) { |
3084 | /* We reached the end of this task list - move on to | 3231 | /* We reached the end of this task list - move on to |
3085 | * the next cg_cgroup_link */ | 3232 | * the next cg_cgroup_link */ |
3086 | cgroup_advance_iter(cgrp, it); | 3233 | cgroup_advance_iter(cgrp, it); |
@@ -3411,7 +3558,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
3411 | } | 3558 | } |
3412 | } | 3559 | } |
3413 | /* entry not found; create a new one */ | 3560 | /* entry not found; create a new one */ |
3414 | l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); | 3561 | l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); |
3415 | if (!l) { | 3562 | if (!l) { |
3416 | mutex_unlock(&cgrp->pidlist_mutex); | 3563 | mutex_unlock(&cgrp->pidlist_mutex); |
3417 | return l; | 3564 | return l; |
@@ -3420,8 +3567,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
3420 | down_write(&l->mutex); | 3567 | down_write(&l->mutex); |
3421 | l->key.type = type; | 3568 | l->key.type = type; |
3422 | l->key.ns = get_pid_ns(ns); | 3569 | l->key.ns = get_pid_ns(ns); |
3423 | l->use_count = 0; /* don't increment here */ | ||
3424 | l->list = NULL; | ||
3425 | l->owner = cgrp; | 3570 | l->owner = cgrp; |
3426 | list_add(&l->links, &cgrp->pidlists); | 3571 | list_add(&l->links, &cgrp->pidlists); |
3427 | mutex_unlock(&cgrp->pidlist_mutex); | 3572 | mutex_unlock(&cgrp->pidlist_mutex); |
@@ -3727,6 +3872,23 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, | |||
3727 | } | 3872 | } |
3728 | 3873 | ||
3729 | /* | 3874 | /* |
3875 | * When dput() is called asynchronously, if umount has been done and | ||
3876 | * then deactivate_super() in cgroup_free_fn() kills the superblock, | ||
3877 | * there's a small window that vfs will see the root dentry with non-zero | ||
3878 | * refcnt and trigger BUG(). | ||
3879 | * | ||
3880 | * That's why we hold a reference before dput() and drop it right after. | ||
3881 | */ | ||
3882 | static void cgroup_dput(struct cgroup *cgrp) | ||
3883 | { | ||
3884 | struct super_block *sb = cgrp->root->sb; | ||
3885 | |||
3886 | atomic_inc(&sb->s_active); | ||
3887 | dput(cgrp->dentry); | ||
3888 | deactivate_super(sb); | ||
3889 | } | ||
3890 | |||
3891 | /* | ||
3730 | * Unregister event and free resources. | 3892 | * Unregister event and free resources. |
3731 | * | 3893 | * |
3732 | * Gets called from workqueue. | 3894 | * Gets called from workqueue. |
@@ -3746,7 +3908,7 @@ static void cgroup_event_remove(struct work_struct *work) | |||
3746 | 3908 | ||
3747 | eventfd_ctx_put(event->eventfd); | 3909 | eventfd_ctx_put(event->eventfd); |
3748 | kfree(event); | 3910 | kfree(event); |
3749 | dput(cgrp->dentry); | 3911 | cgroup_dput(cgrp); |
3750 | } | 3912 | } |
3751 | 3913 | ||
3752 | /* | 3914 | /* |
@@ -3933,33 +4095,16 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, | |||
3933 | return 0; | 4095 | return 0; |
3934 | } | 4096 | } |
3935 | 4097 | ||
3936 | /* | 4098 | static struct cftype cgroup_base_files[] = { |
3937 | * for the common functions, 'private' gives the type of file | ||
3938 | */ | ||
3939 | /* for hysterical raisins, we can't put this on the older files */ | ||
3940 | #define CGROUP_FILE_GENERIC_PREFIX "cgroup." | ||
3941 | static struct cftype files[] = { | ||
3942 | { | ||
3943 | .name = "tasks", | ||
3944 | .open = cgroup_tasks_open, | ||
3945 | .write_u64 = cgroup_tasks_write, | ||
3946 | .release = cgroup_pidlist_release, | ||
3947 | .mode = S_IRUGO | S_IWUSR, | ||
3948 | }, | ||
3949 | { | 4099 | { |
3950 | .name = CGROUP_FILE_GENERIC_PREFIX "procs", | 4100 | .name = "cgroup.procs", |
3951 | .open = cgroup_procs_open, | 4101 | .open = cgroup_procs_open, |
3952 | .write_u64 = cgroup_procs_write, | 4102 | .write_u64 = cgroup_procs_write, |
3953 | .release = cgroup_pidlist_release, | 4103 | .release = cgroup_pidlist_release, |
3954 | .mode = S_IRUGO | S_IWUSR, | 4104 | .mode = S_IRUGO | S_IWUSR, |
3955 | }, | 4105 | }, |
3956 | { | 4106 | { |
3957 | .name = "notify_on_release", | 4107 | .name = "cgroup.event_control", |
3958 | .read_u64 = cgroup_read_notify_on_release, | ||
3959 | .write_u64 = cgroup_write_notify_on_release, | ||
3960 | }, | ||
3961 | { | ||
3962 | .name = CGROUP_FILE_GENERIC_PREFIX "event_control", | ||
3963 | .write_string = cgroup_write_event_control, | 4108 | .write_string = cgroup_write_event_control, |
3964 | .mode = S_IWUGO, | 4109 | .mode = S_IWUGO, |
3965 | }, | 4110 | }, |
@@ -3974,9 +4119,29 @@ static struct cftype files[] = { | |||
3974 | .flags = CFTYPE_ONLY_ON_ROOT, | 4119 | .flags = CFTYPE_ONLY_ON_ROOT, |
3975 | .read_seq_string = cgroup_sane_behavior_show, | 4120 | .read_seq_string = cgroup_sane_behavior_show, |
3976 | }, | 4121 | }, |
4122 | |||
4123 | /* | ||
4124 | * Historical crazy stuff. These don't have "cgroup." prefix and | ||
4125 | * don't exist if sane_behavior. If you're depending on these, be | ||
4126 | * prepared to be burned. | ||
4127 | */ | ||
4128 | { | ||
4129 | .name = "tasks", | ||
4130 | .flags = CFTYPE_INSANE, /* use "procs" instead */ | ||
4131 | .open = cgroup_tasks_open, | ||
4132 | .write_u64 = cgroup_tasks_write, | ||
4133 | .release = cgroup_pidlist_release, | ||
4134 | .mode = S_IRUGO | S_IWUSR, | ||
4135 | }, | ||
4136 | { | ||
4137 | .name = "notify_on_release", | ||
4138 | .flags = CFTYPE_INSANE, | ||
4139 | .read_u64 = cgroup_read_notify_on_release, | ||
4140 | .write_u64 = cgroup_write_notify_on_release, | ||
4141 | }, | ||
3977 | { | 4142 | { |
3978 | .name = "release_agent", | 4143 | .name = "release_agent", |
3979 | .flags = CFTYPE_ONLY_ON_ROOT, | 4144 | .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, |
3980 | .read_seq_string = cgroup_release_agent_show, | 4145 | .read_seq_string = cgroup_release_agent_show, |
3981 | .write_string = cgroup_release_agent_write, | 4146 | .write_string = cgroup_release_agent_write, |
3982 | .max_write_len = PATH_MAX, | 4147 | .max_write_len = PATH_MAX, |
@@ -3997,13 +4162,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | |||
3997 | struct cgroup_subsys *ss; | 4162 | struct cgroup_subsys *ss; |
3998 | 4163 | ||
3999 | if (base_files) { | 4164 | if (base_files) { |
4000 | err = cgroup_addrm_files(cgrp, NULL, files, true); | 4165 | err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); |
4001 | if (err < 0) | 4166 | if (err < 0) |
4002 | return err; | 4167 | return err; |
4003 | } | 4168 | } |
4004 | 4169 | ||
4005 | /* process cftsets of each subsystem */ | 4170 | /* process cftsets of each subsystem */ |
4006 | for_each_subsys(cgrp->root, ss) { | 4171 | for_each_root_subsys(cgrp->root, ss) { |
4007 | struct cftype_set *set; | 4172 | struct cftype_set *set; |
4008 | if (!test_bit(ss->subsys_id, &subsys_mask)) | 4173 | if (!test_bit(ss->subsys_id, &subsys_mask)) |
4009 | continue; | 4174 | continue; |
@@ -4013,15 +4178,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | |||
4013 | } | 4178 | } |
4014 | 4179 | ||
4015 | /* This cgroup is ready now */ | 4180 | /* This cgroup is ready now */ |
4016 | for_each_subsys(cgrp->root, ss) { | 4181 | for_each_root_subsys(cgrp->root, ss) { |
4017 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4182 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
4183 | struct css_id *id = rcu_dereference_protected(css->id, true); | ||
4184 | |||
4018 | /* | 4185 | /* |
4019 | * Update id->css pointer and make this css visible from | 4186 | * Update id->css pointer and make this css visible from |
4020 | * CSS ID functions. This pointer will be dereferened | 4187 | * CSS ID functions. This pointer will be dereferened |
4021 | * from RCU-read-side without locks. | 4188 | * from RCU-read-side without locks. |
4022 | */ | 4189 | */ |
4023 | if (css->id) | 4190 | if (id) |
4024 | rcu_assign_pointer(css->id->css, css); | 4191 | rcu_assign_pointer(id->css, css); |
4025 | } | 4192 | } |
4026 | 4193 | ||
4027 | return 0; | 4194 | return 0; |
@@ -4031,12 +4198,16 @@ static void css_dput_fn(struct work_struct *work) | |||
4031 | { | 4198 | { |
4032 | struct cgroup_subsys_state *css = | 4199 | struct cgroup_subsys_state *css = |
4033 | container_of(work, struct cgroup_subsys_state, dput_work); | 4200 | container_of(work, struct cgroup_subsys_state, dput_work); |
4034 | struct dentry *dentry = css->cgroup->dentry; | ||
4035 | struct super_block *sb = dentry->d_sb; | ||
4036 | 4201 | ||
4037 | atomic_inc(&sb->s_active); | 4202 | cgroup_dput(css->cgroup); |
4038 | dput(dentry); | 4203 | } |
4039 | deactivate_super(sb); | 4204 | |
4205 | static void css_release(struct percpu_ref *ref) | ||
4206 | { | ||
4207 | struct cgroup_subsys_state *css = | ||
4208 | container_of(ref, struct cgroup_subsys_state, refcnt); | ||
4209 | |||
4210 | schedule_work(&css->dput_work); | ||
4040 | } | 4211 | } |
4041 | 4212 | ||
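css_release() is the percpu_ref release callback, and all it does is queue dput_work; the expensive part, the dput()/deactivate_super() pair in cgroup_dput(), runs later from a workqueue rather than from whatever context happened to drop the last reference. A rough single-threaded analogue of that "last put only schedules the real release" split, with the workqueue shrunk to a one-slot pending pointer:

#include <stdio.h>

struct object {
    int refcount;
    void (*release_work)(struct object *);  /* deferred release function */
};

/* --- a toy "workqueue": remembers a single pending item --- */
static struct object *pending_work;

static void schedule_release(struct object *obj)
{
    pending_work = obj;             /* schedule_work() in the kernel */
}

static void run_pending_work(void)
{
    if (pending_work) {
        pending_work->release_work(pending_work);
        pending_work = NULL;
    }
}

/* --- refcounting side --- */
static void heavy_release(struct object *obj)
{
    /* In the patch this is cgroup_dput(): dput() plus the s_active dance,
     * too heavy to run from wherever the last reference was dropped. */
    printf("releasing object %p for real\n", (void *)obj);
}

static void put_object(struct object *obj)
{
    if (--obj->refcount == 0)
        schedule_release(obj);      /* css_release(): only queue the work */
}

int main(void)
{
    struct object obj = { .refcount = 2, .release_work = heavy_release };

    put_object(&obj);               /* nothing happens yet */
    put_object(&obj);               /* last ref: release is queued */
    printf("last put returned immediately\n");
    run_pending_work();             /* the "workqueue" runs later */
    return 0;
}
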
4042 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 4213 | static void init_cgroup_css(struct cgroup_subsys_state *css, |
@@ -4044,10 +4215,9 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
4044 | struct cgroup *cgrp) | 4215 | struct cgroup *cgrp) |
4045 | { | 4216 | { |
4046 | css->cgroup = cgrp; | 4217 | css->cgroup = cgrp; |
4047 | atomic_set(&css->refcnt, 1); | ||
4048 | css->flags = 0; | 4218 | css->flags = 0; |
4049 | css->id = NULL; | 4219 | css->id = NULL; |
4050 | if (cgrp == dummytop) | 4220 | if (cgrp == cgroup_dummy_top) |
4051 | css->flags |= CSS_ROOT; | 4221 | css->flags |= CSS_ROOT; |
4052 | BUG_ON(cgrp->subsys[ss->subsys_id]); | 4222 | BUG_ON(cgrp->subsys[ss->subsys_id]); |
4053 | cgrp->subsys[ss->subsys_id] = css; | 4223 | cgrp->subsys[ss->subsys_id] = css; |
@@ -4157,7 +4327,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4157 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) | 4327 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) |
4158 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4328 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); |
4159 | 4329 | ||
4160 | for_each_subsys(root, ss) { | 4330 | for_each_root_subsys(root, ss) { |
4161 | struct cgroup_subsys_state *css; | 4331 | struct cgroup_subsys_state *css; |
4162 | 4332 | ||
4163 | css = ss->css_alloc(cgrp); | 4333 | css = ss->css_alloc(cgrp); |
@@ -4165,7 +4335,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4165 | err = PTR_ERR(css); | 4335 | err = PTR_ERR(css); |
4166 | goto err_free_all; | 4336 | goto err_free_all; |
4167 | } | 4337 | } |
4338 | |||
4339 | err = percpu_ref_init(&css->refcnt, css_release); | ||
4340 | if (err) | ||
4341 | goto err_free_all; | ||
4342 | |||
4168 | init_cgroup_css(css, ss, cgrp); | 4343 | init_cgroup_css(css, ss, cgrp); |
4344 | |||
4169 | if (ss->use_id) { | 4345 | if (ss->use_id) { |
4170 | err = alloc_css_id(ss, parent, cgrp); | 4346 | err = alloc_css_id(ss, parent, cgrp); |
4171 | if (err) | 4347 | if (err) |
@@ -4183,20 +4359,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4183 | goto err_free_all; | 4359 | goto err_free_all; |
4184 | lockdep_assert_held(&dentry->d_inode->i_mutex); | 4360 | lockdep_assert_held(&dentry->d_inode->i_mutex); |
4185 | 4361 | ||
4362 | cgrp->serial_nr = cgroup_serial_nr_next++; | ||
4363 | |||
4186 | /* allocation complete, commit to creation */ | 4364 | /* allocation complete, commit to creation */ |
4187 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | ||
4188 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | 4365 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); |
4189 | root->number_of_cgroups++; | 4366 | root->number_of_cgroups++; |
4190 | 4367 | ||
4191 | /* each css holds a ref to the cgroup's dentry */ | 4368 | /* each css holds a ref to the cgroup's dentry */ |
4192 | for_each_subsys(root, ss) | 4369 | for_each_root_subsys(root, ss) |
4193 | dget(dentry); | 4370 | dget(dentry); |
4194 | 4371 | ||
4195 | /* hold a ref to the parent's dentry */ | 4372 | /* hold a ref to the parent's dentry */ |
4196 | dget(parent->dentry); | 4373 | dget(parent->dentry); |
4197 | 4374 | ||
4198 | /* creation succeeded, notify subsystems */ | 4375 | /* creation succeeded, notify subsystems */ |
4199 | for_each_subsys(root, ss) { | 4376 | for_each_root_subsys(root, ss) { |
4200 | err = online_css(ss, cgrp); | 4377 | err = online_css(ss, cgrp); |
4201 | if (err) | 4378 | if (err) |
4202 | goto err_destroy; | 4379 | goto err_destroy; |
@@ -4221,9 +4398,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4221 | return 0; | 4398 | return 0; |
4222 | 4399 | ||
4223 | err_free_all: | 4400 | err_free_all: |
4224 | for_each_subsys(root, ss) { | 4401 | for_each_root_subsys(root, ss) { |
4225 | if (cgrp->subsys[ss->subsys_id]) | 4402 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
4403 | |||
4404 | if (css) { | ||
4405 | percpu_ref_cancel_init(&css->refcnt); | ||
4226 | ss->css_free(cgrp); | 4406 | ss->css_free(cgrp); |
4407 | } | ||
4227 | } | 4408 | } |
4228 | mutex_unlock(&cgroup_mutex); | 4409 | mutex_unlock(&cgroup_mutex); |
4229 | /* Release the reference count that we took on the superblock */ | 4410 | /* Release the reference count that we took on the superblock */ |
@@ -4251,63 +4432,120 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
4251 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | 4432 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); |
4252 | } | 4433 | } |
4253 | 4434 | ||
4435 | static void cgroup_css_killed(struct cgroup *cgrp) | ||
4436 | { | ||
4437 | if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) | ||
4438 | return; | ||
4439 | |||
4440 | /* percpu ref's of all css's are killed, kick off the next step */ | ||
4441 | INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); | ||
4442 | schedule_work(&cgrp->destroy_work); | ||
4443 | } | ||
4444 | |||
4445 | static void css_ref_killed_fn(struct percpu_ref *ref) | ||
4446 | { | ||
4447 | struct cgroup_subsys_state *css = | ||
4448 | container_of(ref, struct cgroup_subsys_state, refcnt); | ||
4449 | |||
4450 | cgroup_css_killed(css->cgroup); | ||
4451 | } | ||
4452 | |||
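cgroup_css_killed() above is a small completion counter: css_kill_cnt is primed to 1, bumped once per css whose percpu ref is being killed, decremented by every confirmation callback and once more by cgroup_destroy_locked() itself, and whoever takes it to zero schedules cgroup_offline_fn(). The initial bias of 1 is what stops the offline step from firing while confirmations are still being armed. A minimal standalone model of the scheme; here the "callbacks" run inline instead of from RCU context:

#include <stdio.h>

static int kill_cnt;

static void next_stage(void)
{
    printf("all refs confirmed killed, starting offline work\n");
}

/* Drop one count; whoever drops the last one kicks the next stage. */
static void killed(void)
{
    if (--kill_cnt == 0)
        next_stage();
}

int main(void)
{
    int nr_css = 3;

    kill_cnt = 1;                   /* bias held by the destroyer itself */

    for (int i = 0; i < nr_css; i++) {
        kill_cnt++;                 /* one count per css being killed */
        /* percpu_ref_kill_and_confirm() would fire killed() later, from
         * an RCU callback; here it simply runs in line. */
        killed();
    }

    /* The destroyer drops its bias last; only now can the count reach 0,
     * even if every confirmation has already arrived. */
    killed();
    return 0;
}
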
4453 | /** | ||
4454 | * cgroup_destroy_locked - the first stage of cgroup destruction | ||
4455 | * @cgrp: cgroup to be destroyed | ||
4456 | * | ||
4457 | * css's make use of percpu refcnts whose killing latency shouldn't be | ||
4458 | * exposed to userland and are RCU protected. Also, cgroup core needs to | ||
4459 | * guarantee that css_tryget() won't succeed by the time ->css_offline() is | ||
4460 | * invoked. To satisfy all the requirements, destruction is implemented in | ||
4461 | * the following two steps. | ||
4462 | * | ||
4463 | * s1. Verify @cgrp can be destroyed and mark it dying. Remove all | ||
4464 | * userland visible parts and start killing the percpu refcnts of | ||
4465 | * css's. Set up so that the next stage will be kicked off once all | ||
4466 | * the percpu refcnts are confirmed to be killed. | ||
4467 | * | ||
4468 | * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the | ||
4469 | * rest of destruction. Once all cgroup references are gone, the | ||
4470 | * cgroup is RCU-freed. | ||
4471 | * | ||
4472 | * This function implements s1. After this step, @cgrp is gone as far as | ||
4473 | * the userland is concerned and a new cgroup with the same name may be | ||
4474 | * created. As cgroup doesn't care about the names internally, this | ||
4475 | * doesn't cause any problem. | ||
4476 | */ | ||
4254 | static int cgroup_destroy_locked(struct cgroup *cgrp) | 4477 | static int cgroup_destroy_locked(struct cgroup *cgrp) |
4255 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | 4478 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
4256 | { | 4479 | { |
4257 | struct dentry *d = cgrp->dentry; | 4480 | struct dentry *d = cgrp->dentry; |
4258 | struct cgroup *parent = cgrp->parent; | ||
4259 | struct cgroup_event *event, *tmp; | 4481 | struct cgroup_event *event, *tmp; |
4260 | struct cgroup_subsys *ss; | 4482 | struct cgroup_subsys *ss; |
4483 | bool empty; | ||
4261 | 4484 | ||
4262 | lockdep_assert_held(&d->d_inode->i_mutex); | 4485 | lockdep_assert_held(&d->d_inode->i_mutex); |
4263 | lockdep_assert_held(&cgroup_mutex); | 4486 | lockdep_assert_held(&cgroup_mutex); |
4264 | 4487 | ||
4265 | if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) | 4488 | /* |
4489 | * css_set_lock synchronizes access to ->cset_links and prevents | ||
4490 | * @cgrp from being removed while __put_css_set() is in progress. | ||
4491 | */ | ||
4492 | read_lock(&css_set_lock); | ||
4493 | empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children); | ||
4494 | read_unlock(&css_set_lock); | ||
4495 | if (!empty) | ||
4266 | return -EBUSY; | 4496 | return -EBUSY; |
4267 | 4497 | ||
4268 | /* | 4498 | /* |
4269 | * Block new css_tryget() by deactivating refcnt and mark @cgrp | 4499 | * Block new css_tryget() by killing css refcnts. cgroup core |
4270 | * removed. This makes future css_tryget() and child creation | 4500 | * guarantees that, by the time ->css_offline() is invoked, no new |
4271 | * attempts fail thus maintaining the removal conditions verified | 4501 | * css reference will be given out via css_tryget(). We can't |
4272 | * above. | 4502 | * simply call percpu_ref_kill() and proceed to offlining css's |
4503 | * because percpu_ref_kill() doesn't guarantee that the ref is seen | ||
4504 | * as killed on all CPUs on return. | ||
4505 | * | ||
4506 | * Use percpu_ref_kill_and_confirm() to get notifications as each | ||
4507 | * css is confirmed to be seen as killed on all CPUs. The | ||
4508 | * notification callback keeps track of the number of css's to be | ||
4509 | * killed and schedules cgroup_offline_fn() to perform the rest of | ||
4510 | * destruction once the percpu refs of all css's are confirmed to | ||
4511 | * be killed. | ||
4273 | */ | 4512 | */ |
4274 | for_each_subsys(cgrp->root, ss) { | 4513 | atomic_set(&cgrp->css_kill_cnt, 1); |
4514 | for_each_root_subsys(cgrp->root, ss) { | ||
4275 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4515 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; |
4276 | 4516 | ||
4277 | WARN_ON(atomic_read(&css->refcnt) < 0); | 4517 | /* |
4278 | atomic_add(CSS_DEACT_BIAS, &css->refcnt); | 4518 | * Killing would put the base ref, but we need to keep it |
4279 | } | 4519 | * alive until after ->css_offline. |
4280 | set_bit(CGRP_REMOVED, &cgrp->flags); | 4520 | */ |
4521 | percpu_ref_get(&css->refcnt); | ||
4281 | 4522 | ||
4282 | /* tell subsystems to initate destruction */ | 4523 | atomic_inc(&cgrp->css_kill_cnt); |
4283 | for_each_subsys(cgrp->root, ss) | 4524 | percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); |
4284 | offline_css(ss, cgrp); | 4525 | } |
4526 | cgroup_css_killed(cgrp); | ||
4285 | 4527 | ||
4286 | /* | 4528 | /* |
4287 | * Put all the base refs. Each css holds an extra reference to the | 4529 | * Mark @cgrp dead. This prevents further task migration and child |
4288 | * cgroup's dentry and cgroup removal proceeds regardless of css | 4530 | * creation by disabling cgroup_lock_live_group(). Note that |
4289 | * refs. On the last put of each css, whenever that may be, the | 4531 | * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to |
4290 | * extra dentry ref is put so that dentry destruction happens only | 4532 | * resume iteration after dropping RCU read lock. See |
4291 | * after all css's are released. | 4533 | * cgroup_next_sibling() for details. |
4292 | */ | 4534 | */ |
4293 | for_each_subsys(cgrp->root, ss) | 4535 | set_bit(CGRP_DEAD, &cgrp->flags); |
4294 | css_put(cgrp->subsys[ss->subsys_id]); | ||
4295 | 4536 | ||
4537 | /* CGRP_DEAD is set, remove from ->release_list for the last time */ | ||
4296 | raw_spin_lock(&release_list_lock); | 4538 | raw_spin_lock(&release_list_lock); |
4297 | if (!list_empty(&cgrp->release_list)) | 4539 | if (!list_empty(&cgrp->release_list)) |
4298 | list_del_init(&cgrp->release_list); | 4540 | list_del_init(&cgrp->release_list); |
4299 | raw_spin_unlock(&release_list_lock); | 4541 | raw_spin_unlock(&release_list_lock); |
4300 | 4542 | ||
4301 | /* delete this cgroup from parent->children */ | 4543 | /* |
4302 | list_del_rcu(&cgrp->sibling); | 4544 | * Remove @cgrp directory. The removal puts the base ref but we |
4303 | list_del_init(&cgrp->allcg_node); | 4545 | * aren't quite done with @cgrp yet, so hold onto it. |
4304 | 4546 | */ | |
4305 | dget(d); | 4547 | dget(d); |
4306 | cgroup_d_remove_dir(d); | 4548 | cgroup_d_remove_dir(d); |
4307 | dput(d); | ||
4308 | |||
4309 | set_bit(CGRP_RELEASABLE, &parent->flags); | ||
4310 | check_for_release(parent); | ||
4311 | 4549 | ||
4312 | /* | 4550 | /* |
4313 | * Unregister events and notify userspace. | 4551 | * Unregister events and notify userspace. |
@@ -4322,6 +4560,53 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4322 | spin_unlock(&cgrp->event_list_lock); | 4560 | spin_unlock(&cgrp->event_list_lock); |
4323 | 4561 | ||
4324 | return 0; | 4562 | return 0; |
4563 | }; | ||
4564 | |||
4565 | /** | ||
4566 | * cgroup_offline_fn - the second step of cgroup destruction | ||
4567 | * @work: cgroup->destroy_free_work | ||
4568 | * | ||
4569 | * This function is invoked from a work item for a cgroup which is being | ||
4570 | * destroyed after the percpu refcnts of all css's are guaranteed to be | ||
4571 | * seen as killed on all CPUs, and performs the rest of destruction. This | ||
4572 | * is the second step of destruction described in the comment above | ||
4573 | * cgroup_destroy_locked(). | ||
4574 | */ | ||
4575 | static void cgroup_offline_fn(struct work_struct *work) | ||
4576 | { | ||
4577 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); | ||
4578 | struct cgroup *parent = cgrp->parent; | ||
4579 | struct dentry *d = cgrp->dentry; | ||
4580 | struct cgroup_subsys *ss; | ||
4581 | |||
4582 | mutex_lock(&cgroup_mutex); | ||
4583 | |||
4584 | /* | ||
4585 | * css_tryget() is guaranteed to fail now. Tell subsystems to | ||
4586 | * initiate destruction. | ||
4587 | */ | ||
4588 | for_each_root_subsys(cgrp->root, ss) | ||
4589 | offline_css(ss, cgrp); | ||
4590 | |||
4591 | /* | ||
4592 | * Put the css refs from cgroup_destroy_locked(). Each css holds | ||
4593 | * an extra reference to the cgroup's dentry and cgroup removal | ||
4594 | * proceeds regardless of css refs. On the last put of each css, | ||
4595 | * whenever that may be, the extra dentry ref is put so that dentry | ||
4596 | * destruction happens only after all css's are released. | ||
4597 | */ | ||
4598 | for_each_root_subsys(cgrp->root, ss) | ||
4599 | css_put(cgrp->subsys[ss->subsys_id]); | ||
4600 | |||
4601 | /* delete this cgroup from parent->children */ | ||
4602 | list_del_rcu(&cgrp->sibling); | ||
4603 | |||
4604 | dput(d); | ||
4605 | |||
4606 | set_bit(CGRP_RELEASABLE, &parent->flags); | ||
4607 | check_for_release(parent); | ||
4608 | |||
4609 | mutex_unlock(&cgroup_mutex); | ||
4325 | } | 4610 | } |
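The pair of functions above splits removal into a synchronous stage (kill each css's percpu ref, mark the cgroup dead, remove the directory) and a deferred stage (offline the css's and drop the held references) that may only run once every ref is confirmed killed on all CPUs. The glue between the two stages is just a biased countdown: one bias reference plus one count per css, with cgroup_offline_fn() scheduled by whichever decrement reaches zero. Below is a minimal userspace analogy of that handshake, with invented names (fake_cgroup, confirm_killed) and a plain thread standing in for the work item; it is not the percpu_ref API, and in the kernel the confirmations arrive asynchronously from the ref callbacks rather than inline as here.

    /* Biased countdown: the counter starts at 1 (the bias), gains one per
     * object being killed, and the deferred "offline" step runs only when
     * the final decrement drops it to zero.  All names are invented. */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    struct fake_cgroup {
            atomic_int kill_cnt;            /* outstanding confirmations + bias */
            pthread_t offline_worker;       /* stands in for destroy_work */
    };

    static void *offline_fn(void *arg)      /* second stage, runs exactly once */
    {
            struct fake_cgroup *cgrp = arg;

            printf("all refs confirmed dead, offlining %p\n", (void *)cgrp);
            return NULL;
    }

    static void confirm_killed(struct fake_cgroup *cgrp)
    {
            /* the last confirmation kicks off the second stage */
            if (atomic_fetch_sub(&cgrp->kill_cnt, 1) == 1)
                    pthread_create(&cgrp->offline_worker, NULL, offline_fn, cgrp);
    }

    static void destroy(struct fake_cgroup *cgrp, int nr_css)
    {
            /* the bias keeps the counter from reaching zero while objects
             * are still being registered, mirroring atomic_set(.., 1) above */
            atomic_init(&cgrp->kill_cnt, 1);

            for (int i = 0; i < nr_css; i++) {
                    atomic_fetch_add(&cgrp->kill_cnt, 1);
                    confirm_killed(cgrp);   /* kernel: called from the ref callback */
            }
            confirm_killed(cgrp);           /* drop the bias */
    }

    int main(void)
    {
            struct fake_cgroup cgrp;

            destroy(&cgrp, 3);
            pthread_join(cgrp.offline_worker, NULL);
            return 0;
    }

The percpu_ref_get() in cgroup_destroy_locked() above serves a separate purpose that the sketch leaves out: killing the ref puts the base reference, and each css has to stay alive until after ->css_offline() runs in the second stage.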
4326 | 4611 | ||
4327 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | 4612 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) |
@@ -4361,12 +4646,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4361 | cgroup_init_cftsets(ss); | 4646 | cgroup_init_cftsets(ss); |
4362 | 4647 | ||
4363 | /* Create the top cgroup state for this subsystem */ | 4648 | /* Create the top cgroup state for this subsystem */ |
4364 | list_add(&ss->sibling, &rootnode.subsys_list); | 4649 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); |
4365 | ss->root = &rootnode; | 4650 | ss->root = &cgroup_dummy_root; |
4366 | css = ss->css_alloc(dummytop); | 4651 | css = ss->css_alloc(cgroup_dummy_top); |
4367 | /* We don't handle early failures gracefully */ | 4652 | /* We don't handle early failures gracefully */ |
4368 | BUG_ON(IS_ERR(css)); | 4653 | BUG_ON(IS_ERR(css)); |
4369 | init_cgroup_css(css, ss, dummytop); | 4654 | init_cgroup_css(css, ss, cgroup_dummy_top); |
4370 | 4655 | ||
4371 | /* Update the init_css_set to contain a subsys | 4656 | /* Update the init_css_set to contain a subsys |
4372 | * pointer to this state - since the subsystem is | 4657 | * pointer to this state - since the subsystem is |
@@ -4381,7 +4666,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4381 | * need to invoke fork callbacks here. */ | 4666 | * need to invoke fork callbacks here. */ |
4382 | BUG_ON(!list_empty(&init_task.tasks)); | 4667 | BUG_ON(!list_empty(&init_task.tasks)); |
4383 | 4668 | ||
4384 | BUG_ON(online_css(ss, dummytop)); | 4669 | BUG_ON(online_css(ss, cgroup_dummy_top)); |
4385 | 4670 | ||
4386 | mutex_unlock(&cgroup_mutex); | 4671 | mutex_unlock(&cgroup_mutex); |
4387 | 4672 | ||
@@ -4404,7 +4689,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4404 | struct cgroup_subsys_state *css; | 4689 | struct cgroup_subsys_state *css; |
4405 | int i, ret; | 4690 | int i, ret; |
4406 | struct hlist_node *tmp; | 4691 | struct hlist_node *tmp; |
4407 | struct css_set *cg; | 4692 | struct css_set *cset; |
4408 | unsigned long key; | 4693 | unsigned long key; |
4409 | 4694 | ||
4410 | /* check name and function validity */ | 4695 | /* check name and function validity */ |
@@ -4427,7 +4712,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4427 | */ | 4712 | */ |
4428 | if (ss->module == NULL) { | 4713 | if (ss->module == NULL) { |
4429 | /* a sanity check */ | 4714 | /* a sanity check */ |
4430 | BUG_ON(subsys[ss->subsys_id] != ss); | 4715 | BUG_ON(cgroup_subsys[ss->subsys_id] != ss); |
4431 | return 0; | 4716 | return 0; |
4432 | } | 4717 | } |
4433 | 4718 | ||
@@ -4435,26 +4720,26 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4435 | cgroup_init_cftsets(ss); | 4720 | cgroup_init_cftsets(ss); |
4436 | 4721 | ||
4437 | mutex_lock(&cgroup_mutex); | 4722 | mutex_lock(&cgroup_mutex); |
4438 | subsys[ss->subsys_id] = ss; | 4723 | cgroup_subsys[ss->subsys_id] = ss; |
4439 | 4724 | ||
4440 | /* | 4725 | /* |
4441 | * no ss->css_alloc seems to need anything important in the ss | 4726 | * no ss->css_alloc seems to need anything important in the ss |
4442 | * struct, so this can happen first (i.e. before the rootnode | 4727 | * struct, so this can happen first (i.e. before the dummy root |
4443 | * attachment). | 4728 | * attachment). |
4444 | */ | 4729 | */ |
4445 | css = ss->css_alloc(dummytop); | 4730 | css = ss->css_alloc(cgroup_dummy_top); |
4446 | if (IS_ERR(css)) { | 4731 | if (IS_ERR(css)) { |
4447 | /* failure case - need to deassign the subsys[] slot. */ | 4732 | /* failure case - need to deassign the cgroup_subsys[] slot. */ |
4448 | subsys[ss->subsys_id] = NULL; | 4733 | cgroup_subsys[ss->subsys_id] = NULL; |
4449 | mutex_unlock(&cgroup_mutex); | 4734 | mutex_unlock(&cgroup_mutex); |
4450 | return PTR_ERR(css); | 4735 | return PTR_ERR(css); |
4451 | } | 4736 | } |
4452 | 4737 | ||
4453 | list_add(&ss->sibling, &rootnode.subsys_list); | 4738 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); |
4454 | ss->root = &rootnode; | 4739 | ss->root = &cgroup_dummy_root; |
4455 | 4740 | ||
4456 | /* our new subsystem will be attached to the dummy hierarchy. */ | 4741 | /* our new subsystem will be attached to the dummy hierarchy. */ |
4457 | init_cgroup_css(css, ss, dummytop); | 4742 | init_cgroup_css(css, ss, cgroup_dummy_top); |
4458 | /* init_idr must be after init_cgroup_css because it sets css->id. */ | 4743 | /* init_idr must be after init_cgroup_css because it sets css->id. */ |
4459 | if (ss->use_id) { | 4744 | if (ss->use_id) { |
4460 | ret = cgroup_init_idr(ss, css); | 4745 | ret = cgroup_init_idr(ss, css); |
@@ -4471,21 +4756,21 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4471 | * this is all done under the css_set_lock. | 4756 | * this is all done under the css_set_lock. |
4472 | */ | 4757 | */ |
4473 | write_lock(&css_set_lock); | 4758 | write_lock(&css_set_lock); |
4474 | hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { | 4759 | hash_for_each_safe(css_set_table, i, tmp, cset, hlist) { |
4475 | /* skip entries that we already rehashed */ | 4760 | /* skip entries that we already rehashed */ |
4476 | if (cg->subsys[ss->subsys_id]) | 4761 | if (cset->subsys[ss->subsys_id]) |
4477 | continue; | 4762 | continue; |
4478 | /* remove existing entry */ | 4763 | /* remove existing entry */ |
4479 | hash_del(&cg->hlist); | 4764 | hash_del(&cset->hlist); |
4480 | /* set new value */ | 4765 | /* set new value */ |
4481 | cg->subsys[ss->subsys_id] = css; | 4766 | cset->subsys[ss->subsys_id] = css; |
4482 | /* recompute hash and restore entry */ | 4767 | /* recompute hash and restore entry */ |
4483 | key = css_set_hash(cg->subsys); | 4768 | key = css_set_hash(cset->subsys); |
4484 | hash_add(css_set_table, &cg->hlist, key); | 4769 | hash_add(css_set_table, &cset->hlist, key); |
4485 | } | 4770 | } |
4486 | write_unlock(&css_set_lock); | 4771 | write_unlock(&css_set_lock); |
4487 | 4772 | ||
4488 | ret = online_css(ss, dummytop); | 4773 | ret = online_css(ss, cgroup_dummy_top); |
4489 | if (ret) | 4774 | if (ret) |
4490 | goto err_unload; | 4775 | goto err_unload; |
4491 | 4776 | ||
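One detail of the rehash loop above is easy to miss: cset->subsys[] is part of the material fed to css_set_hash(), so it cannot be modified while the css_set is still hashed, or the entry would be stranded in a bucket that no longer matches its key. Hence the strict unhash, modify, recompute, re-add sequence under css_set_lock (cgroup_unload_subsys() further down performs the inverse with the same discipline). A small self-contained sketch of the pattern over an invented chained table follows; the node type and hash function are made up for illustration.

    /* "Unhash, mutate the key material, rehash, re-add": the node must be
     * out of the table while anything that feeds its hash changes. */
    #include <stdio.h>

    #define NR_BUCKETS 8

    struct node {
            unsigned long key;      /* stands in for the subsys[]-derived key */
            struct node *next;
    };

    static struct node *table[NR_BUCKETS];

    static unsigned long hash(unsigned long key)
    {
            return key % NR_BUCKETS;
    }

    static void hash_add_node(struct node *n)
    {
            unsigned long b = hash(n->key);

            n->next = table[b];
            table[b] = n;
    }

    static void hash_del_node(struct node *n)
    {
            struct node **pp = &table[hash(n->key)];

            while (*pp && *pp != n)
                    pp = &(*pp)->next;
            if (*pp)
                    *pp = n->next;
    }

    static void rekey(struct node *n, unsigned long new_key)
    {
            hash_del_node(n);       /* remove under the old key ... */
            n->key = new_key;       /* ... change the key material ... */
            hash_add_node(n);       /* ... re-add under the recomputed hash */
    }

    int main(void)
    {
            struct node n = { .key = 3 };

            hash_add_node(&n);
            rekey(&n, 42);
            printf("node now lives in bucket %lu\n", hash(n.key));
            return 0;
    }

If the key were changed first, hash_del_node() would search the wrong bucket and leave a stale entry behind, which is exactly what the kernel loop avoids.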
@@ -4511,7 +4796,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); | |||
4511 | */ | 4796 | */ |
4512 | void cgroup_unload_subsys(struct cgroup_subsys *ss) | 4797 | void cgroup_unload_subsys(struct cgroup_subsys *ss) |
4513 | { | 4798 | { |
4514 | struct cg_cgroup_link *link; | 4799 | struct cgrp_cset_link *link; |
4515 | 4800 | ||
4516 | BUG_ON(ss->module == NULL); | 4801 | BUG_ON(ss->module == NULL); |
4517 | 4802 | ||
@@ -4520,45 +4805,46 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4520 | * try_module_get in parse_cgroupfs_options should ensure that it | 4805 | * try_module_get in parse_cgroupfs_options should ensure that it |
4521 | * doesn't start being used while we're killing it off. | 4806 | * doesn't start being used while we're killing it off. |
4522 | */ | 4807 | */ |
4523 | BUG_ON(ss->root != &rootnode); | 4808 | BUG_ON(ss->root != &cgroup_dummy_root); |
4524 | 4809 | ||
4525 | mutex_lock(&cgroup_mutex); | 4810 | mutex_lock(&cgroup_mutex); |
4526 | 4811 | ||
4527 | offline_css(ss, dummytop); | 4812 | offline_css(ss, cgroup_dummy_top); |
4528 | 4813 | ||
4529 | if (ss->use_id) | 4814 | if (ss->use_id) |
4530 | idr_destroy(&ss->idr); | 4815 | idr_destroy(&ss->idr); |
4531 | 4816 | ||
4532 | /* deassign the subsys_id */ | 4817 | /* deassign the subsys_id */ |
4533 | subsys[ss->subsys_id] = NULL; | 4818 | cgroup_subsys[ss->subsys_id] = NULL; |
4534 | 4819 | ||
4535 | /* remove subsystem from rootnode's list of subsystems */ | 4820 | /* remove subsystem from the dummy root's list of subsystems */ |
4536 | list_del_init(&ss->sibling); | 4821 | list_del_init(&ss->sibling); |
4537 | 4822 | ||
4538 | /* | 4823 | /* |
4539 | * disentangle the css from all css_sets attached to the dummytop. as | 4824 | * disentangle the css from all css_sets attached to the dummy |
4540 | * in loading, we need to pay our respects to the hashtable gods. | 4825 | * top. as in loading, we need to pay our respects to the hashtable |
4826 | * gods. | ||
4541 | */ | 4827 | */ |
4542 | write_lock(&css_set_lock); | 4828 | write_lock(&css_set_lock); |
4543 | list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { | 4829 | list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) { |
4544 | struct css_set *cg = link->cg; | 4830 | struct css_set *cset = link->cset; |
4545 | unsigned long key; | 4831 | unsigned long key; |
4546 | 4832 | ||
4547 | hash_del(&cg->hlist); | 4833 | hash_del(&cset->hlist); |
4548 | cg->subsys[ss->subsys_id] = NULL; | 4834 | cset->subsys[ss->subsys_id] = NULL; |
4549 | key = css_set_hash(cg->subsys); | 4835 | key = css_set_hash(cset->subsys); |
4550 | hash_add(css_set_table, &cg->hlist, key); | 4836 | hash_add(css_set_table, &cset->hlist, key); |
4551 | } | 4837 | } |
4552 | write_unlock(&css_set_lock); | 4838 | write_unlock(&css_set_lock); |
4553 | 4839 | ||
4554 | /* | 4840 | /* |
4555 | * remove subsystem's css from the dummytop and free it - need to | 4841 | * remove subsystem's css from the cgroup_dummy_top and free it - |
4556 | * free before marking as null because ss->css_free needs the | 4842 | * need to free before marking as null because ss->css_free needs |
4557 | * cgrp->subsys pointer to find their state. note that this also | 4843 | * the cgrp->subsys pointer to find their state. note that this |
4558 | * takes care of freeing the css_id. | 4844 | * also takes care of freeing the css_id. |
4559 | */ | 4845 | */ |
4560 | ss->css_free(dummytop); | 4846 | ss->css_free(cgroup_dummy_top); |
4561 | dummytop->subsys[ss->subsys_id] = NULL; | 4847 | cgroup_dummy_top->subsys[ss->subsys_id] = NULL; |
4562 | 4848 | ||
4563 | mutex_unlock(&cgroup_mutex); | 4849 | mutex_unlock(&cgroup_mutex); |
4564 | } | 4850 | } |
@@ -4572,30 +4858,25 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys); | |||
4572 | */ | 4858 | */ |
4573 | int __init cgroup_init_early(void) | 4859 | int __init cgroup_init_early(void) |
4574 | { | 4860 | { |
4861 | struct cgroup_subsys *ss; | ||
4575 | int i; | 4862 | int i; |
4863 | |||
4576 | atomic_set(&init_css_set.refcount, 1); | 4864 | atomic_set(&init_css_set.refcount, 1); |
4577 | INIT_LIST_HEAD(&init_css_set.cg_links); | 4865 | INIT_LIST_HEAD(&init_css_set.cgrp_links); |
4578 | INIT_LIST_HEAD(&init_css_set.tasks); | 4866 | INIT_LIST_HEAD(&init_css_set.tasks); |
4579 | INIT_HLIST_NODE(&init_css_set.hlist); | 4867 | INIT_HLIST_NODE(&init_css_set.hlist); |
4580 | css_set_count = 1; | 4868 | css_set_count = 1; |
4581 | init_cgroup_root(&rootnode); | 4869 | init_cgroup_root(&cgroup_dummy_root); |
4582 | root_count = 1; | 4870 | cgroup_root_count = 1; |
4583 | init_task.cgroups = &init_css_set; | 4871 | RCU_INIT_POINTER(init_task.cgroups, &init_css_set); |
4584 | 4872 | ||
4585 | init_css_set_link.cg = &init_css_set; | 4873 | init_cgrp_cset_link.cset = &init_css_set; |
4586 | init_css_set_link.cgrp = dummytop; | 4874 | init_cgrp_cset_link.cgrp = cgroup_dummy_top; |
4587 | list_add(&init_css_set_link.cgrp_link_list, | 4875 | list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); |
4588 | &rootnode.top_cgroup.css_sets); | 4876 | list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); |
4589 | list_add(&init_css_set_link.cg_link_list, | ||
4590 | &init_css_set.cg_links); | ||
4591 | |||
4592 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
4593 | struct cgroup_subsys *ss = subsys[i]; | ||
4594 | |||
4595 | /* at bootup time, we don't worry about modular subsystems */ | ||
4596 | if (!ss || ss->module) | ||
4597 | continue; | ||
4598 | 4877 | ||
4878 | /* at bootup time, we don't worry about modular subsystems */ | ||
4879 | for_each_builtin_subsys(ss, i) { | ||
4599 | BUG_ON(!ss->name); | 4880 | BUG_ON(!ss->name); |
4600 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); | 4881 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); |
4601 | BUG_ON(!ss->css_alloc); | 4882 | BUG_ON(!ss->css_alloc); |
@@ -4620,30 +4901,33 @@ int __init cgroup_init_early(void) | |||
4620 | */ | 4901 | */ |
4621 | int __init cgroup_init(void) | 4902 | int __init cgroup_init(void) |
4622 | { | 4903 | { |
4623 | int err; | 4904 | struct cgroup_subsys *ss; |
4624 | int i; | ||
4625 | unsigned long key; | 4905 | unsigned long key; |
4906 | int i, err; | ||
4626 | 4907 | ||
4627 | err = bdi_init(&cgroup_backing_dev_info); | 4908 | err = bdi_init(&cgroup_backing_dev_info); |
4628 | if (err) | 4909 | if (err) |
4629 | return err; | 4910 | return err; |
4630 | 4911 | ||
4631 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4912 | for_each_builtin_subsys(ss, i) { |
4632 | struct cgroup_subsys *ss = subsys[i]; | ||
4633 | |||
4634 | /* at bootup time, we don't worry about modular subsystems */ | ||
4635 | if (!ss || ss->module) | ||
4636 | continue; | ||
4637 | if (!ss->early_init) | 4913 | if (!ss->early_init) |
4638 | cgroup_init_subsys(ss); | 4914 | cgroup_init_subsys(ss); |
4639 | if (ss->use_id) | 4915 | if (ss->use_id) |
4640 | cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); | 4916 | cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); |
4641 | } | 4917 | } |
4642 | 4918 | ||
4919 | /* allocate id for the dummy hierarchy */ | ||
4920 | mutex_lock(&cgroup_mutex); | ||
4921 | mutex_lock(&cgroup_root_mutex); | ||
4922 | |||
4643 | /* Add init_css_set to the hash table */ | 4923 | /* Add init_css_set to the hash table */ |
4644 | key = css_set_hash(init_css_set.subsys); | 4924 | key = css_set_hash(init_css_set.subsys); |
4645 | hash_add(css_set_table, &init_css_set.hlist, key); | 4925 | hash_add(css_set_table, &init_css_set.hlist, key); |
4646 | BUG_ON(!init_root_id(&rootnode)); | 4926 | |
4927 | BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); | ||
4928 | |||
4929 | mutex_unlock(&cgroup_root_mutex); | ||
4930 | mutex_unlock(&cgroup_mutex); | ||
4647 | 4931 | ||
4648 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | 4932 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); |
4649 | if (!cgroup_kobj) { | 4933 | if (!cgroup_kobj) { |
@@ -4708,7 +4992,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) | |||
4708 | int count = 0; | 4992 | int count = 0; |
4709 | 4993 | ||
4710 | seq_printf(m, "%d:", root->hierarchy_id); | 4994 | seq_printf(m, "%d:", root->hierarchy_id); |
4711 | for_each_subsys(root, ss) | 4995 | for_each_root_subsys(root, ss) |
4712 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | 4996 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); |
4713 | if (strlen(root->name)) | 4997 | if (strlen(root->name)) |
4714 | seq_printf(m, "%sname=%s", count ? "," : "", | 4998 | seq_printf(m, "%sname=%s", count ? "," : "", |
@@ -4734,6 +5018,7 @@ out: | |||
4734 | /* Display information about each subsystem and each hierarchy */ | 5018 | /* Display information about each subsystem and each hierarchy */ |
4735 | static int proc_cgroupstats_show(struct seq_file *m, void *v) | 5019 | static int proc_cgroupstats_show(struct seq_file *m, void *v) |
4736 | { | 5020 | { |
5021 | struct cgroup_subsys *ss; | ||
4737 | int i; | 5022 | int i; |
4738 | 5023 | ||
4739 | seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); | 5024 | seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); |
@@ -4743,14 +5028,12 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) | |||
4743 | * subsys/hierarchy state. | 5028 | * subsys/hierarchy state. |
4744 | */ | 5029 | */ |
4745 | mutex_lock(&cgroup_mutex); | 5030 | mutex_lock(&cgroup_mutex); |
4746 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 5031 | |
4747 | struct cgroup_subsys *ss = subsys[i]; | 5032 | for_each_subsys(ss, i) |
4748 | if (ss == NULL) | ||
4749 | continue; | ||
4750 | seq_printf(m, "%s\t%d\t%d\t%d\n", | 5033 | seq_printf(m, "%s\t%d\t%d\t%d\n", |
4751 | ss->name, ss->root->hierarchy_id, | 5034 | ss->name, ss->root->hierarchy_id, |
4752 | ss->root->number_of_cgroups, !ss->disabled); | 5035 | ss->root->number_of_cgroups, !ss->disabled); |
4753 | } | 5036 | |
4754 | mutex_unlock(&cgroup_mutex); | 5037 | mutex_unlock(&cgroup_mutex); |
4755 | return 0; | 5038 | return 0; |
4756 | } | 5039 | } |
@@ -4786,8 +5069,8 @@ static const struct file_operations proc_cgroupstats_operations = { | |||
4786 | void cgroup_fork(struct task_struct *child) | 5069 | void cgroup_fork(struct task_struct *child) |
4787 | { | 5070 | { |
4788 | task_lock(current); | 5071 | task_lock(current); |
5072 | get_css_set(task_css_set(current)); | ||
4789 | child->cgroups = current->cgroups; | 5073 | child->cgroups = current->cgroups; |
4790 | get_css_set(child->cgroups); | ||
4791 | task_unlock(current); | 5074 | task_unlock(current); |
4792 | INIT_LIST_HEAD(&child->cg_list); | 5075 | INIT_LIST_HEAD(&child->cg_list); |
4793 | } | 5076 | } |
@@ -4804,6 +5087,7 @@ void cgroup_fork(struct task_struct *child) | |||
4804 | */ | 5087 | */ |
4805 | void cgroup_post_fork(struct task_struct *child) | 5088 | void cgroup_post_fork(struct task_struct *child) |
4806 | { | 5089 | { |
5090 | struct cgroup_subsys *ss; | ||
4807 | int i; | 5091 | int i; |
4808 | 5092 | ||
4809 | /* | 5093 | /* |
@@ -4821,7 +5105,7 @@ void cgroup_post_fork(struct task_struct *child) | |||
4821 | write_lock(&css_set_lock); | 5105 | write_lock(&css_set_lock); |
4822 | task_lock(child); | 5106 | task_lock(child); |
4823 | if (list_empty(&child->cg_list)) | 5107 | if (list_empty(&child->cg_list)) |
4824 | list_add(&child->cg_list, &child->cgroups->tasks); | 5108 | list_add(&child->cg_list, &task_css_set(child)->tasks); |
4825 | task_unlock(child); | 5109 | task_unlock(child); |
4826 | write_unlock(&css_set_lock); | 5110 | write_unlock(&css_set_lock); |
4827 | } | 5111 | } |
@@ -4840,12 +5124,9 @@ void cgroup_post_fork(struct task_struct *child) | |||
4840 | * of the array can be freed at module unload, so we | 5124 | * of the array can be freed at module unload, so we |
4841 | * can't touch that. | 5125 | * can't touch that. |
4842 | */ | 5126 | */ |
4843 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | 5127 | for_each_builtin_subsys(ss, i) |
4844 | struct cgroup_subsys *ss = subsys[i]; | ||
4845 | |||
4846 | if (ss->fork) | 5128 | if (ss->fork) |
4847 | ss->fork(child); | 5129 | ss->fork(child); |
4848 | } | ||
4849 | } | 5130 | } |
4850 | } | 5131 | } |
4851 | 5132 | ||
@@ -4886,7 +5167,8 @@ void cgroup_post_fork(struct task_struct *child) | |||
4886 | */ | 5167 | */ |
4887 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) | 5168 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) |
4888 | { | 5169 | { |
4889 | struct css_set *cg; | 5170 | struct cgroup_subsys *ss; |
5171 | struct css_set *cset; | ||
4890 | int i; | 5172 | int i; |
4891 | 5173 | ||
4892 | /* | 5174 | /* |
@@ -4903,36 +5185,32 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
4903 | 5185 | ||
4904 | /* Reassign the task to the init_css_set. */ | 5186 | /* Reassign the task to the init_css_set. */ |
4905 | task_lock(tsk); | 5187 | task_lock(tsk); |
4906 | cg = tsk->cgroups; | 5188 | cset = task_css_set(tsk); |
4907 | tsk->cgroups = &init_css_set; | 5189 | RCU_INIT_POINTER(tsk->cgroups, &init_css_set); |
4908 | 5190 | ||
4909 | if (run_callbacks && need_forkexit_callback) { | 5191 | if (run_callbacks && need_forkexit_callback) { |
4910 | /* | 5192 | /* |
4911 | * fork/exit callbacks are supported only for builtin | 5193 | * fork/exit callbacks are supported only for builtin |
4912 | * subsystems, see cgroup_post_fork() for details. | 5194 | * subsystems, see cgroup_post_fork() for details. |
4913 | */ | 5195 | */ |
4914 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | 5196 | for_each_builtin_subsys(ss, i) { |
4915 | struct cgroup_subsys *ss = subsys[i]; | ||
4916 | |||
4917 | if (ss->exit) { | 5197 | if (ss->exit) { |
4918 | struct cgroup *old_cgrp = | 5198 | struct cgroup *old_cgrp = cset->subsys[i]->cgroup; |
4919 | rcu_dereference_raw(cg->subsys[i])->cgroup; | ||
4920 | struct cgroup *cgrp = task_cgroup(tsk, i); | 5199 | struct cgroup *cgrp = task_cgroup(tsk, i); |
5200 | |||
4921 | ss->exit(cgrp, old_cgrp, tsk); | 5201 | ss->exit(cgrp, old_cgrp, tsk); |
4922 | } | 5202 | } |
4923 | } | 5203 | } |
4924 | } | 5204 | } |
4925 | task_unlock(tsk); | 5205 | task_unlock(tsk); |
4926 | 5206 | ||
4927 | put_css_set_taskexit(cg); | 5207 | put_css_set_taskexit(cset); |
4928 | } | 5208 | } |
4929 | 5209 | ||
4930 | static void check_for_release(struct cgroup *cgrp) | 5210 | static void check_for_release(struct cgroup *cgrp) |
4931 | { | 5211 | { |
4932 | /* All of these checks rely on RCU to keep the cgroup | ||
4933 | * structure alive */ | ||
4934 | if (cgroup_is_releasable(cgrp) && | 5212 | if (cgroup_is_releasable(cgrp) && |
4935 | !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { | 5213 | list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { |
4936 | /* | 5214 | /* |
4937 | * Control Group is currently removeable. If it's not | 5215 | * Control Group is currently removeable. If it's not |
4938 | * already queued for a userspace notification, queue | 5216 | * already queued for a userspace notification, queue |
@@ -4941,7 +5219,7 @@ static void check_for_release(struct cgroup *cgrp) | |||
4941 | int need_schedule_work = 0; | 5219 | int need_schedule_work = 0; |
4942 | 5220 | ||
4943 | raw_spin_lock(&release_list_lock); | 5221 | raw_spin_lock(&release_list_lock); |
4944 | if (!cgroup_is_removed(cgrp) && | 5222 | if (!cgroup_is_dead(cgrp) && |
4945 | list_empty(&cgrp->release_list)) { | 5223 | list_empty(&cgrp->release_list)) { |
4946 | list_add(&cgrp->release_list, &release_list); | 5224 | list_add(&cgrp->release_list, &release_list); |
4947 | need_schedule_work = 1; | 5225 | need_schedule_work = 1; |
@@ -4952,34 +5230,6 @@ static void check_for_release(struct cgroup *cgrp) | |||
4952 | } | 5230 | } |
4953 | } | 5231 | } |
4954 | 5232 | ||
4955 | /* Caller must verify that the css is not for root cgroup */ | ||
4956 | bool __css_tryget(struct cgroup_subsys_state *css) | ||
4957 | { | ||
4958 | while (true) { | ||
4959 | int t, v; | ||
4960 | |||
4961 | v = css_refcnt(css); | ||
4962 | t = atomic_cmpxchg(&css->refcnt, v, v + 1); | ||
4963 | if (likely(t == v)) | ||
4964 | return true; | ||
4965 | else if (t < 0) | ||
4966 | return false; | ||
4967 | cpu_relax(); | ||
4968 | } | ||
4969 | } | ||
4970 | EXPORT_SYMBOL_GPL(__css_tryget); | ||
4971 | |||
4972 | /* Caller must verify that the css is not for root cgroup */ | ||
4973 | void __css_put(struct cgroup_subsys_state *css) | ||
4974 | { | ||
4975 | int v; | ||
4976 | |||
4977 | v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); | ||
4978 | if (v == 0) | ||
4979 | schedule_work(&css->dput_work); | ||
4980 | } | ||
4981 | EXPORT_SYMBOL_GPL(__css_put); | ||
4982 | |||
4983 | /* | 5233 | /* |
4984 | * Notify userspace when a cgroup is released, by running the | 5234 | * Notify userspace when a cgroup is released, by running the |
4985 | * configured release agent with the name of the cgroup (path | 5235 | * configured release agent with the name of the cgroup (path |
@@ -5054,23 +5304,19 @@ static void cgroup_release_agent(struct work_struct *work) | |||
5054 | 5304 | ||
5055 | static int __init cgroup_disable(char *str) | 5305 | static int __init cgroup_disable(char *str) |
5056 | { | 5306 | { |
5057 | int i; | 5307 | struct cgroup_subsys *ss; |
5058 | char *token; | 5308 | char *token; |
5309 | int i; | ||
5059 | 5310 | ||
5060 | while ((token = strsep(&str, ",")) != NULL) { | 5311 | while ((token = strsep(&str, ",")) != NULL) { |
5061 | if (!*token) | 5312 | if (!*token) |
5062 | continue; | 5313 | continue; |
5063 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | ||
5064 | struct cgroup_subsys *ss = subsys[i]; | ||
5065 | |||
5066 | /* | ||
5067 | * cgroup_disable, being at boot time, can't | ||
5068 | * know about module subsystems, so we don't | ||
5069 | * worry about them. | ||
5070 | */ | ||
5071 | if (!ss || ss->module) | ||
5072 | continue; | ||
5073 | 5314 | ||
5315 | /* | ||
5316 | * cgroup_disable, being at boot time, can't know about | ||
5317 | * module subsystems, so we don't worry about them. | ||
5318 | */ | ||
5319 | for_each_builtin_subsys(ss, i) { | ||
5074 | if (!strcmp(token, ss->name)) { | 5320 | if (!strcmp(token, ss->name)) { |
5075 | ss->disabled = 1; | 5321 | ss->disabled = 1; |
5076 | printk(KERN_INFO "Disabling %s control group" | 5322 | printk(KERN_INFO "Disabling %s control group" |
@@ -5087,9 +5333,7 @@ __setup("cgroup_disable=", cgroup_disable); | |||
5087 | * Functions for CSS ID. | 5333 |
5088 | */ | 5334 | */ |
5089 | 5335 | ||
5090 | /* | 5336 | /* to get ID other than 0, this should be called when !cgroup_is_dead() */ |
5091 | *To get ID other than 0, this should be called when !cgroup_is_removed(). | ||
5092 | */ | ||
5093 | unsigned short css_id(struct cgroup_subsys_state *css) | 5337 | unsigned short css_id(struct cgroup_subsys_state *css) |
5094 | { | 5338 | { |
5095 | struct css_id *cssid; | 5339 | struct css_id *cssid; |
@@ -5099,7 +5343,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) | |||
5099 | * on this or this is under rcu_read_lock(). Once css->id is allocated, | 5343 | * on this or this is under rcu_read_lock(). Once css->id is allocated, |
5100 | * it's unchanged until freed. | 5344 | * it's unchanged until freed. |
5101 | */ | 5345 | */ |
5102 | cssid = rcu_dereference_check(css->id, css_refcnt(css)); | 5346 | cssid = rcu_dereference_raw(css->id); |
5103 | 5347 | ||
5104 | if (cssid) | 5348 | if (cssid) |
5105 | return cssid->id; | 5349 | return cssid->id; |
@@ -5107,18 +5351,6 @@ unsigned short css_id(struct cgroup_subsys_state *css) | |||
5107 | } | 5351 | } |
5108 | EXPORT_SYMBOL_GPL(css_id); | 5352 | EXPORT_SYMBOL_GPL(css_id); |
5109 | 5353 | ||
5110 | unsigned short css_depth(struct cgroup_subsys_state *css) | ||
5111 | { | ||
5112 | struct css_id *cssid; | ||
5113 | |||
5114 | cssid = rcu_dereference_check(css->id, css_refcnt(css)); | ||
5115 | |||
5116 | if (cssid) | ||
5117 | return cssid->depth; | ||
5118 | return 0; | ||
5119 | } | ||
5120 | EXPORT_SYMBOL_GPL(css_depth); | ||
5121 | |||
5122 | /** | 5354 | /** |
5123 | * css_is_ancestor - test "root" css is an ancestor of "child" | 5355 | * css_is_ancestor - test "root" css is an ancestor of "child" |
5124 | * @child: the css to be tested. | 5356 | * @child: the css to be tested. |
@@ -5153,7 +5385,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, | |||
5153 | 5385 | ||
5154 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | 5386 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) |
5155 | { | 5387 | { |
5156 | struct css_id *id = css->id; | 5388 | struct css_id *id = rcu_dereference_protected(css->id, true); |
5389 | |||
5157 | /* When this is called before css_id initialization, id can be NULL */ | 5390 | /* When this is called before css_id initialization, id can be NULL */ |
5158 | if (!id) | 5391 | if (!id) |
5159 | return; | 5392 | return; |
@@ -5219,8 +5452,8 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, | |||
5219 | return PTR_ERR(newid); | 5452 | return PTR_ERR(newid); |
5220 | 5453 | ||
5221 | newid->stack[0] = newid->id; | 5454 | newid->stack[0] = newid->id; |
5222 | newid->css = rootcss; | 5455 | RCU_INIT_POINTER(newid->css, rootcss); |
5223 | rootcss->id = newid; | 5456 | RCU_INIT_POINTER(rootcss->id, newid); |
5224 | return 0; | 5457 | return 0; |
5225 | } | 5458 | } |
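The conversions in this hunk, together with the rcu_dereference_raw()/rcu_dereference_protected() changes in css_id() and free_css_id() above, lean on the same observation: at those call sites either the pointer has not been published to RCU readers yet, or the caller otherwise guarantees the object is stable, so no publication barrier is needed, and with the atomic refcount gone there is no longer a css_refcnt() condition to hand to rcu_dereference_check(). A plain C11 rendering of the distinction, using invented stub types rather than the kernel primitives, may make the intent clearer.

    /* Illustration only: css_stub/cssid_stub are invented stand-ins. */
    #include <stdatomic.h>

    struct cssid_stub { unsigned short id; };
    struct css_stub   { _Atomic(struct cssid_stub *) id; };

    /* Not yet visible to any reader: a relaxed store is enough.  This is
     * the case RCU_INIT_POINTER() covers in cgroup_init_idr() above. */
    void css_id_init(struct css_stub *css, struct cssid_stub *cssid)
    {
            atomic_store_explicit(&css->id, cssid, memory_order_relaxed);
    }

    /* Publishing to readers that may already be running needs release
     * ordering, which is what rcu_assign_pointer() would supply. */
    void css_id_publish(struct css_stub *css, struct cssid_stub *cssid)
    {
            atomic_store_explicit(&css->id, cssid, memory_order_release);
    }

    /* Reader with updates excluded (the rcu_dereference_protected() case):
     * only the value matters, ordering is a non-issue. */
    struct cssid_stub *css_id_locked_read(struct css_stub *css)
    {
            return atomic_load_explicit(&css->id, memory_order_relaxed);
    }

    int main(void)
    {
            static struct cssid_stub first = { .id = 1 }, second = { .id = 2 };
            struct css_stub css;

            css_id_init(&css, &first);      /* pre-publication initialisation */
            css_id_publish(&css, &second);  /* later, globally visible update */
            return css_id_locked_read(&css)->id == 2 ? 0 : 1;
    }

This is only the memory-ordering half of the story; the RCU grace-period side, i.e. when the old css_id may actually be freed, is unchanged by this hunk.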
5226 | 5459 | ||
@@ -5234,7 +5467,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, | |||
5234 | subsys_id = ss->subsys_id; | 5467 | subsys_id = ss->subsys_id; |
5235 | parent_css = parent->subsys[subsys_id]; | 5468 | parent_css = parent->subsys[subsys_id]; |
5236 | child_css = child->subsys[subsys_id]; | 5469 | child_css = child->subsys[subsys_id]; |
5237 | parent_id = parent_css->id; | 5470 | parent_id = rcu_dereference_protected(parent_css->id, true); |
5238 | depth = parent_id->depth + 1; | 5471 | depth = parent_id->depth + 1; |
5239 | 5472 | ||
5240 | child_id = get_new_cssid(ss, depth); | 5473 | child_id = get_new_cssid(ss, depth); |
@@ -5299,7 +5532,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | |||
5299 | } | 5532 | } |
5300 | 5533 | ||
5301 | #ifdef CONFIG_CGROUP_DEBUG | 5534 | #ifdef CONFIG_CGROUP_DEBUG |
5302 | static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) | 5535 | static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) |
5303 | { | 5536 | { |
5304 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | 5537 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); |
5305 | 5538 | ||
@@ -5309,48 +5542,43 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) | |||
5309 | return css; | 5542 | return css; |
5310 | } | 5543 | } |
5311 | 5544 | ||
5312 | static void debug_css_free(struct cgroup *cont) | 5545 | static void debug_css_free(struct cgroup *cgrp) |
5313 | { | ||
5314 | kfree(cont->subsys[debug_subsys_id]); | ||
5315 | } | ||
5316 | |||
5317 | static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) | ||
5318 | { | 5546 | { |
5319 | return atomic_read(&cont->count); | 5547 | kfree(cgrp->subsys[debug_subsys_id]); |
5320 | } | 5548 | } |
5321 | 5549 | ||
5322 | static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) | 5550 | static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) |
5323 | { | 5551 | { |
5324 | return cgroup_task_count(cont); | 5552 | return cgroup_task_count(cgrp); |
5325 | } | 5553 | } |
5326 | 5554 | ||
5327 | static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) | 5555 | static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) |
5328 | { | 5556 | { |
5329 | return (u64)(unsigned long)current->cgroups; | 5557 | return (u64)(unsigned long)current->cgroups; |
5330 | } | 5558 | } |
5331 | 5559 | ||
5332 | static u64 current_css_set_refcount_read(struct cgroup *cont, | 5560 | static u64 current_css_set_refcount_read(struct cgroup *cgrp, |
5333 | struct cftype *cft) | 5561 | struct cftype *cft) |
5334 | { | 5562 | { |
5335 | u64 count; | 5563 | u64 count; |
5336 | 5564 | ||
5337 | rcu_read_lock(); | 5565 | rcu_read_lock(); |
5338 | count = atomic_read(¤t->cgroups->refcount); | 5566 | count = atomic_read(&task_css_set(current)->refcount); |
5339 | rcu_read_unlock(); | 5567 | rcu_read_unlock(); |
5340 | return count; | 5568 | return count; |
5341 | } | 5569 | } |
5342 | 5570 | ||
5343 | static int current_css_set_cg_links_read(struct cgroup *cont, | 5571 | static int current_css_set_cg_links_read(struct cgroup *cgrp, |
5344 | struct cftype *cft, | 5572 | struct cftype *cft, |
5345 | struct seq_file *seq) | 5573 | struct seq_file *seq) |
5346 | { | 5574 | { |
5347 | struct cg_cgroup_link *link; | 5575 | struct cgrp_cset_link *link; |
5348 | struct css_set *cg; | 5576 | struct css_set *cset; |
5349 | 5577 | ||
5350 | read_lock(&css_set_lock); | 5578 | read_lock(&css_set_lock); |
5351 | rcu_read_lock(); | 5579 | rcu_read_lock(); |
5352 | cg = rcu_dereference(current->cgroups); | 5580 | cset = rcu_dereference(current->cgroups); |
5353 | list_for_each_entry(link, &cg->cg_links, cg_link_list) { | 5581 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { |
5354 | struct cgroup *c = link->cgrp; | 5582 | struct cgroup *c = link->cgrp; |
5355 | const char *name; | 5583 | const char *name; |
5356 | 5584 | ||
@@ -5367,19 +5595,19 @@ static int current_css_set_cg_links_read(struct cgroup *cont, | |||
5367 | } | 5595 | } |
5368 | 5596 | ||
5369 | #define MAX_TASKS_SHOWN_PER_CSS 25 | 5597 | #define MAX_TASKS_SHOWN_PER_CSS 25 |
5370 | static int cgroup_css_links_read(struct cgroup *cont, | 5598 | static int cgroup_css_links_read(struct cgroup *cgrp, |
5371 | struct cftype *cft, | 5599 | struct cftype *cft, |
5372 | struct seq_file *seq) | 5600 | struct seq_file *seq) |
5373 | { | 5601 | { |
5374 | struct cg_cgroup_link *link; | 5602 | struct cgrp_cset_link *link; |
5375 | 5603 | ||
5376 | read_lock(&css_set_lock); | 5604 | read_lock(&css_set_lock); |
5377 | list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { | 5605 | list_for_each_entry(link, &cgrp->cset_links, cset_link) { |
5378 | struct css_set *cg = link->cg; | 5606 | struct css_set *cset = link->cset; |
5379 | struct task_struct *task; | 5607 | struct task_struct *task; |
5380 | int count = 0; | 5608 | int count = 0; |
5381 | seq_printf(seq, "css_set %p\n", cg); | 5609 | seq_printf(seq, "css_set %p\n", cset); |
5382 | list_for_each_entry(task, &cg->tasks, cg_list) { | 5610 | list_for_each_entry(task, &cset->tasks, cg_list) { |
5383 | if (count++ > MAX_TASKS_SHOWN_PER_CSS) { | 5611 | if (count++ > MAX_TASKS_SHOWN_PER_CSS) { |
5384 | seq_puts(seq, " ...\n"); | 5612 | seq_puts(seq, " ...\n"); |
5385 | break; | 5613 | break; |
@@ -5400,10 +5628,6 @@ static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) | |||
5400 | 5628 | ||
5401 | static struct cftype debug_files[] = { | 5629 | static struct cftype debug_files[] = { |
5402 | { | 5630 | { |
5403 | .name = "cgroup_refcount", | ||
5404 | .read_u64 = cgroup_refcount_read, | ||
5405 | }, | ||
5406 | { | ||
5407 | .name = "taskcount", | 5631 | .name = "taskcount", |
5408 | .read_u64 = debug_taskcount_read, | 5632 | .read_u64 = debug_taskcount_read, |
5409 | }, | 5633 | }, |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 902d13fc2b13..e5657788fedd 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -59,6 +59,7 @@ | |||
59 | #include <linux/mutex.h> | 59 | #include <linux/mutex.h> |
60 | #include <linux/workqueue.h> | 60 | #include <linux/workqueue.h> |
61 | #include <linux/cgroup.h> | 61 | #include <linux/cgroup.h> |
62 | #include <linux/wait.h> | ||
62 | 63 | ||
63 | /* | 64 | /* |
64 | * Tracks how many cpusets are currently defined in system. | 65 | * Tracks how many cpusets are currently defined in system. |
@@ -87,6 +88,18 @@ struct cpuset { | |||
87 | cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ | 88 | cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ |
88 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ | 89 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ |
89 | 90 | ||
91 | /* | ||
92 | * These are the old Memory Nodes the tasks took on. | ||
93 | * | ||
94 | * - top_cpuset.old_mems_allowed is initialized to mems_allowed. | ||
95 | * - A new cpuset's old_mems_allowed is initialized when some | ||
96 | * task is moved into it. | ||
97 | * - old_mems_allowed is used in cpuset_migrate_mm() when we change | ||
98 | * cpuset.mems_allowed and have tasks' nodemask updated, and | ||
99 | * then old_mems_allowed is updated to mems_allowed. | ||
100 | */ | ||
101 | nodemask_t old_mems_allowed; | ||
102 | |||
90 | struct fmeter fmeter; /* memory_pressure filter */ | 103 | struct fmeter fmeter; /* memory_pressure filter */ |
91 | 104 | ||
92 | /* | 105 | /* |
@@ -100,14 +113,12 @@ struct cpuset { | |||
100 | 113 | ||
101 | /* for custom sched domain */ | 114 | /* for custom sched domain */ |
102 | int relax_domain_level; | 115 | int relax_domain_level; |
103 | |||
104 | struct work_struct hotplug_work; | ||
105 | }; | 116 | }; |
106 | 117 | ||
107 | /* Retrieve the cpuset for a cgroup */ | 118 | /* Retrieve the cpuset for a cgroup */ |
108 | static inline struct cpuset *cgroup_cs(struct cgroup *cont) | 119 | static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) |
109 | { | 120 | { |
110 | return container_of(cgroup_subsys_state(cont, cpuset_subsys_id), | 121 | return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), |
111 | struct cpuset, css); | 122 | struct cpuset, css); |
112 | } | 123 | } |
113 | 124 | ||
@@ -267,14 +278,11 @@ static DEFINE_MUTEX(callback_mutex); | |||
267 | /* | 278 | /* |
268 | * CPU / memory hotplug is handled asynchronously. | 279 | * CPU / memory hotplug is handled asynchronously. |
269 | */ | 280 | */ |
270 | static struct workqueue_struct *cpuset_propagate_hotplug_wq; | ||
271 | |||
272 | static void cpuset_hotplug_workfn(struct work_struct *work); | 281 | static void cpuset_hotplug_workfn(struct work_struct *work); |
273 | static void cpuset_propagate_hotplug_workfn(struct work_struct *work); | ||
274 | static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); | ||
275 | |||
276 | static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); | 282 | static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); |
277 | 283 | ||
284 | static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); | ||
285 | |||
278 | /* | 286 | /* |
279 | * This is ugly, but preserves the userspace API for existing cpuset | 287 | * This is ugly, but preserves the userspace API for existing cpuset |
280 | * users. If someone tries to mount the "cpuset" filesystem, we | 288 | * users. If someone tries to mount the "cpuset" filesystem, we |
@@ -304,53 +312,38 @@ static struct file_system_type cpuset_fs_type = { | |||
304 | /* | 312 | /* |
305 | * Return in pmask the portion of a cpusets's cpus_allowed that | 313 | * Return in pmask the portion of a cpusets's cpus_allowed that |
306 | * are online. If none are online, walk up the cpuset hierarchy | 314 | * are online. If none are online, walk up the cpuset hierarchy |
307 | * until we find one that does have some online cpus. If we get | 315 | * until we find one that does have some online cpus. The top |
308 | * all the way to the top and still haven't found any online cpus, | 316 | * cpuset always has some cpus online. |
309 | * return cpu_online_mask. Or if passed a NULL cs from an exit'ing | ||
310 | * task, return cpu_online_mask. | ||
311 | * | 317 | * |
312 | * One way or another, we guarantee to return some non-empty subset | 318 | * One way or another, we guarantee to return some non-empty subset |
313 | * of cpu_online_mask. | 319 | * of cpu_online_mask. |
314 | * | 320 | * |
315 | * Call with callback_mutex held. | 321 | * Call with callback_mutex held. |
316 | */ | 322 | */ |
317 | |||
318 | static void guarantee_online_cpus(const struct cpuset *cs, | 323 | static void guarantee_online_cpus(const struct cpuset *cs, |
319 | struct cpumask *pmask) | 324 | struct cpumask *pmask) |
320 | { | 325 | { |
321 | while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) | 326 | while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) |
322 | cs = parent_cs(cs); | 327 | cs = parent_cs(cs); |
323 | if (cs) | 328 | cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); |
324 | cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); | ||
325 | else | ||
326 | cpumask_copy(pmask, cpu_online_mask); | ||
327 | BUG_ON(!cpumask_intersects(pmask, cpu_online_mask)); | ||
328 | } | 329 | } |
329 | 330 | ||
330 | /* | 331 | /* |
331 | * Return in *pmask the portion of a cpusets's mems_allowed that | 332 | * Return in *pmask the portion of a cpusets's mems_allowed that |
332 | * are online, with memory. If none are online with memory, walk | 333 | * are online, with memory. If none are online with memory, walk |
333 | * up the cpuset hierarchy until we find one that does have some | 334 | * up the cpuset hierarchy until we find one that does have some |
334 | * online mems. If we get all the way to the top and still haven't | 335 | * online mems. The top cpuset always has some mems online. |
335 | * found any online mems, return node_states[N_MEMORY]. | ||
336 | * | 336 | * |
337 | * One way or another, we guarantee to return some non-empty subset | 337 | * One way or another, we guarantee to return some non-empty subset |
338 | * of node_states[N_MEMORY]. | 338 | * of node_states[N_MEMORY]. |
339 | * | 339 | * |
340 | * Call with callback_mutex held. | 340 | * Call with callback_mutex held. |
341 | */ | 341 | */ |
342 | |||
343 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 342 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
344 | { | 343 | { |
345 | while (cs && !nodes_intersects(cs->mems_allowed, | 344 | while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) |
346 | node_states[N_MEMORY])) | ||
347 | cs = parent_cs(cs); | 345 | cs = parent_cs(cs); |
348 | if (cs) | 346 | nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); |
349 | nodes_and(*pmask, cs->mems_allowed, | ||
350 | node_states[N_MEMORY]); | ||
351 | else | ||
352 | *pmask = node_states[N_MEMORY]; | ||
353 | BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY])); | ||
354 | } | 347 | } |
355 | 348 | ||
356 | /* | 349 | /* |
@@ -440,7 +433,7 @@ static void free_trial_cpuset(struct cpuset *trial) | |||
440 | 433 | ||
441 | static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | 434 | static int validate_change(const struct cpuset *cur, const struct cpuset *trial) |
442 | { | 435 | { |
443 | struct cgroup *cont; | 436 | struct cgroup *cgrp; |
444 | struct cpuset *c, *par; | 437 | struct cpuset *c, *par; |
445 | int ret; | 438 | int ret; |
446 | 439 | ||
@@ -448,7 +441,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
448 | 441 | ||
449 | /* Each of our child cpusets must be a subset of us */ | 442 | /* Each of our child cpusets must be a subset of us */ |
450 | ret = -EBUSY; | 443 | ret = -EBUSY; |
451 | cpuset_for_each_child(c, cont, cur) | 444 | cpuset_for_each_child(c, cgrp, cur) |
452 | if (!is_cpuset_subset(c, trial)) | 445 | if (!is_cpuset_subset(c, trial)) |
453 | goto out; | 446 | goto out; |
454 | 447 | ||
@@ -469,7 +462,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
469 | * overlap | 462 | * overlap |
470 | */ | 463 | */ |
471 | ret = -EINVAL; | 464 | ret = -EINVAL; |
472 | cpuset_for_each_child(c, cont, par) { | 465 | cpuset_for_each_child(c, cgrp, par) { |
473 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && | 466 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && |
474 | c != cur && | 467 | c != cur && |
475 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) | 468 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) |
@@ -486,7 +479,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
486 | */ | 479 | */ |
487 | ret = -ENOSPC; | 480 | ret = -ENOSPC; |
488 | if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && | 481 | if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && |
489 | (cpumask_empty(trial->cpus_allowed) || | 482 | (cpumask_empty(trial->cpus_allowed) && |
490 | nodes_empty(trial->mems_allowed))) | 483 | nodes_empty(trial->mems_allowed))) |
491 | goto out; | 484 | goto out; |
492 | 485 | ||
@@ -798,21 +791,43 @@ void rebuild_sched_domains(void) | |||
798 | mutex_unlock(&cpuset_mutex); | 791 | mutex_unlock(&cpuset_mutex); |
799 | } | 792 | } |
800 | 793 | ||
801 | /** | 794 | /* |
802 | * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's | 795 | * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus |
803 | * @tsk: task to test | 796 | * @cs: the cpuset in interest |
804 | * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner | ||
805 | * | 797 | * |
806 | * Call with cpuset_mutex held. May take callback_mutex during call. | 798 | * A cpuset's effective cpumask is the cpumask of the nearest ancestor |
807 | * Called for each task in a cgroup by cgroup_scan_tasks(). | 799 | * with non-empty cpus. We use effective cpumask whenever: |
808 | * Return nonzero if this tasks's cpus_allowed mask should be changed (in other | 800 | * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask |
809 | * words, if its mask is not equal to its cpuset's mask). | 801 | * if the cpuset they reside in has no cpus) |
802 | * - we want to retrieve task_cs(tsk)'s cpus_allowed. | ||
803 | * | ||
804 | * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an | ||
805 | * exception. See comments there. | ||
810 | */ | 806 | */ |
811 | static int cpuset_test_cpumask(struct task_struct *tsk, | 807 | static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs) |
812 | struct cgroup_scanner *scan) | ||
813 | { | 808 | { |
814 | return !cpumask_equal(&tsk->cpus_allowed, | 809 | while (cpumask_empty(cs->cpus_allowed)) |
815 | (cgroup_cs(scan->cg))->cpus_allowed); | 810 | cs = parent_cs(cs); |
811 | return cs; | ||
812 | } | ||
813 | |||
814 | /* | ||
815 | * effective_nodemask_cpuset - return nearest ancestor with non-empty mems | ||
816 | * @cs: the cpuset in interest | ||
817 | * | ||
818 | * A cpuset's effective nodemask is the nodemask of the nearest ancestor | ||
819 | * with non-empty mems. We use effective nodemask whenever: | ||
820 | * - we update tasks' mems_allowed. (they take on the ancestor's nodemask | ||
821 | * if the cpuset they reside in has no mems) | ||
822 | * - we want to retrieve task_cs(tsk)'s mems_allowed. | ||
823 | * | ||
824 | * Called with cpuset_mutex held. | ||
825 | */ | ||
826 | static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) | ||
827 | { | ||
828 | while (nodes_empty(cs->mems_allowed)) | ||
829 | cs = parent_cs(cs); | ||
830 | return cs; | ||
816 | } | 831 | } |
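effective_cpumask_cpuset() and effective_nodemask_cpuset(), like the simplified guarantee_online_cpus()/guarantee_online_mems() earlier in this patch, are all the same walk: climb toward the root until a cpuset with a non-empty mask is found, relying on the invariant that the top cpuset always has CPUs and memory so the loop terminates. A compact sketch of that fallback walk over an invented parent-linked type (the kernel code steps through struct cpuset via parent_cs()):

    /* "Nearest ancestor with a non-empty mask"; fake_cpuset is invented. */
    #include <assert.h>
    #include <stdio.h>

    struct fake_cpuset {
            unsigned long cpus;             /* bitmask, 0 means "no CPUs of its own" */
            struct fake_cpuset *parent;     /* NULL only for the top cpuset */
    };

    static struct fake_cpuset *effective_cpus(struct fake_cpuset *cs)
    {
            /* the top cpuset always has a non-empty mask, so this terminates */
            while (cs->cpus == 0) {
                    assert(cs->parent);
                    cs = cs->parent;
            }
            return cs;
    }

    int main(void)
    {
            struct fake_cpuset top  = { .cpus = 0xf, .parent = NULL };
            struct fake_cpuset mid  = { .cpus = 0x0, .parent = &top };
            struct fake_cpuset leaf = { .cpus = 0x0, .parent = &mid };

            printf("effective mask: %#lx\n", effective_cpus(&leaf)->cpus);
            return 0;
    }

The comment change in guarantee_online_cpus() above ("The top cpuset always has some cpus online") states the same termination argument in the kernel's own words.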
817 | 832 | ||
818 | /** | 833 | /** |
@@ -829,7 +844,10 @@ static int cpuset_test_cpumask(struct task_struct *tsk, | |||
829 | static void cpuset_change_cpumask(struct task_struct *tsk, | 844 | static void cpuset_change_cpumask(struct task_struct *tsk, |
830 | struct cgroup_scanner *scan) | 845 | struct cgroup_scanner *scan) |
831 | { | 846 | { |
832 | set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); | 847 | struct cpuset *cpus_cs; |
848 | |||
849 | cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg)); | ||
850 | set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); | ||
833 | } | 851 | } |
834 | 852 | ||
835 | /** | 853 | /** |
@@ -850,12 +868,51 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) | |||
850 | struct cgroup_scanner scan; | 868 | struct cgroup_scanner scan; |
851 | 869 | ||
852 | scan.cg = cs->css.cgroup; | 870 | scan.cg = cs->css.cgroup; |
853 | scan.test_task = cpuset_test_cpumask; | 871 | scan.test_task = NULL; |
854 | scan.process_task = cpuset_change_cpumask; | 872 | scan.process_task = cpuset_change_cpumask; |
855 | scan.heap = heap; | 873 | scan.heap = heap; |
856 | cgroup_scan_tasks(&scan); | 874 | cgroup_scan_tasks(&scan); |
857 | } | 875 | } |
858 | 876 | ||
877 | /* | ||
878 | * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. | ||
879 | * @root_cs: the root cpuset of the hierarchy | ||
880 | * @update_root: update root cpuset or not? | ||
881 | * @heap: the heap used by cgroup_scan_tasks() | ||
882 | * | ||
883 | * This will update cpumasks of tasks in @root_cs and all other empty cpusets | ||
884 | * which take on cpumask of @root_cs. | ||
885 | * | ||
886 | * Called with cpuset_mutex held | ||
887 | */ | ||
888 | static void update_tasks_cpumask_hier(struct cpuset *root_cs, | ||
889 | bool update_root, struct ptr_heap *heap) | ||
890 | { | ||
891 | struct cpuset *cp; | ||
892 | struct cgroup *pos_cgrp; | ||
893 | |||
894 | if (update_root) | ||
895 | update_tasks_cpumask(root_cs, heap); | ||
896 | |||
897 | rcu_read_lock(); | ||
898 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | ||
899 | /* skip the whole subtree if @cp has some CPUs */ | ||
900 | if (!cpumask_empty(cp->cpus_allowed)) { | ||
901 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | ||
902 | continue; | ||
903 | } | ||
904 | if (!css_tryget(&cp->css)) | ||
905 | continue; | ||
906 | rcu_read_unlock(); | ||
907 | |||
908 | update_tasks_cpumask(cp, heap); | ||
909 | |||
910 | rcu_read_lock(); | ||
911 | css_put(&cp->css); | ||
912 | } | ||
913 | rcu_read_unlock(); | ||
914 | } | ||
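update_tasks_cpumask_hier() combines two patterns: a pre-order descendant walk that prunes any subtree whose root has CPUs of its own (those cpusets do not take on @root_cs's mask, so their tasks need no update), and a css_tryget()/css_put() pair that pins the current cpuset so the RCU read lock can be dropped around the heavyweight per-task update and iteration can still resume afterwards. A minimal sketch of just the pruned pre-order walk, over an invented tree type; the reference pinning and RCU handling are deliberately left out.

    /* Pre-order walk with subtree pruning, mirroring
     * cpuset_for_each_descendant_pre() + cgroup_rightmost_descendant().
     * fake_cpuset and update_tasks() are invented for the example. */
    #include <stdio.h>

    struct fake_cpuset {
            const char *name;
            int has_own_cpus;               /* non-empty cpus_allowed */
            struct fake_cpuset *child[4];   /* NULL-terminated */
    };

    static void update_tasks(struct fake_cpuset *cs)
    {
            printf("updating tasks in %s\n", cs->name);
    }

    static void update_hier(struct fake_cpuset *cs)
    {
            for (int i = 0; i < 4 && cs->child[i]; i++) {
                    struct fake_cpuset *cp = cs->child[i];

                    /* skip the whole subtree if @cp has CPUs of its own */
                    if (cp->has_own_cpus)
                            continue;
                    update_tasks(cp);
                    update_hier(cp);
            }
    }

    int main(void)
    {
            struct fake_cpuset c = { .name = "A/B/C" };
            struct fake_cpuset b = { .name = "A/B", .has_own_cpus = 1,
                                     .child = { &c } };
            struct fake_cpuset d = { .name = "A/D" };
            struct fake_cpuset a = { .name = "A", .has_own_cpus = 1,
                                     .child = { &b, &d } };

            update_tasks(&a);       /* the update_root == true case */
            update_hier(&a);        /* visits A/D, prunes A/B and below */
            return 0;
    }

In the kernel walk the pruning is done by jumping to cgroup_rightmost_descendant(), since the iterator is flat rather than recursive, but the effect is the same.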
915 | |||
859 | /** | 916 | /** |
860 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it | 917 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it |
861 | * @cs: the cpuset to consider | 918 | * @cs: the cpuset to consider |
@@ -888,14 +945,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
888 | if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) | 945 | if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) |
889 | return -EINVAL; | 946 | return -EINVAL; |
890 | } | 947 | } |
891 | retval = validate_change(cs, trialcs); | ||
892 | if (retval < 0) | ||
893 | return retval; | ||
894 | 948 | ||
895 | /* Nothing to do if the cpus didn't change */ | 949 | /* Nothing to do if the cpus didn't change */ |
896 | if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) | 950 | if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) |
897 | return 0; | 951 | return 0; |
898 | 952 | ||
953 | retval = validate_change(cs, trialcs); | ||
954 | if (retval < 0) | ||
955 | return retval; | ||
956 | |||
899 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); | 957 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); |
900 | if (retval) | 958 | if (retval) |
901 | return retval; | 959 | return retval; |
@@ -906,11 +964,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
906 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); | 964 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); |
907 | mutex_unlock(&callback_mutex); | 965 | mutex_unlock(&callback_mutex); |
908 | 966 | ||
909 | /* | 967 | update_tasks_cpumask_hier(cs, true, &heap); |
910 | * Scan tasks in the cpuset, and update the cpumasks of any | ||
911 | * that need an update. | ||
912 | */ | ||
913 | update_tasks_cpumask(cs, &heap); | ||
914 | 968 | ||
915 | heap_free(&heap); | 969 | heap_free(&heap); |
916 | 970 | ||
@@ -943,12 +997,14 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | |||
943 | const nodemask_t *to) | 997 | const nodemask_t *to) |
944 | { | 998 | { |
945 | struct task_struct *tsk = current; | 999 | struct task_struct *tsk = current; |
1000 | struct cpuset *mems_cs; | ||
946 | 1001 | ||
947 | tsk->mems_allowed = *to; | 1002 | tsk->mems_allowed = *to; |
948 | 1003 | ||
949 | do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); | 1004 | do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); |
950 | 1005 | ||
951 | guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); | 1006 | mems_cs = effective_nodemask_cpuset(task_cs(tsk)); |
1007 | guarantee_online_mems(mems_cs, &tsk->mems_allowed); | ||
952 | } | 1008 | } |
953 | 1009 | ||
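effective_nodemask_cpuset() is defined outside this hunk, so the following is only a guess at its shape: when a cpuset's own mask has been emptied (now possible, since empty cpusets may keep their tasks), fall back to the nearest ancestor with something still set. A toy C sketch of that walk-up, using a plain bitmask instead of nodemask_t:

#include <stdio.h>

struct cs {
        unsigned long mems;     /* toy nodemask: one bit per memory node */
        struct cs *parent;
};

/* nearest ancestor (or the cpuset itself) with a non-empty nodemask */
static struct cs *effective_mems(struct cs *c)
{
        while (c->parent && c->mems == 0)
                c = c->parent;
        return c;
}

int main(void)
{
        struct cs top   = { .mems = 0x3 };              /* nodes 0 and 1 */
        struct cs empty = { .mems = 0x0, .parent = &top };

        printf("effective mems: %#lx\n", effective_mems(&empty)->mems);
        return 0;
}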
954 | /* | 1010 | /* |
@@ -1007,16 +1063,12 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
1007 | static void cpuset_change_nodemask(struct task_struct *p, | 1063 | static void cpuset_change_nodemask(struct task_struct *p, |
1008 | struct cgroup_scanner *scan) | 1064 | struct cgroup_scanner *scan) |
1009 | { | 1065 | { |
1066 | struct cpuset *cs = cgroup_cs(scan->cg); | ||
1010 | struct mm_struct *mm; | 1067 | struct mm_struct *mm; |
1011 | struct cpuset *cs; | ||
1012 | int migrate; | 1068 | int migrate; |
1013 | const nodemask_t *oldmem = scan->data; | 1069 | nodemask_t *newmems = scan->data; |
1014 | static nodemask_t newmems; /* protected by cpuset_mutex */ | ||
1015 | |||
1016 | cs = cgroup_cs(scan->cg); | ||
1017 | guarantee_online_mems(cs, &newmems); | ||
1018 | 1070 | ||
1019 | cpuset_change_task_nodemask(p, &newmems); | 1071 | cpuset_change_task_nodemask(p, newmems); |
1020 | 1072 | ||
1021 | mm = get_task_mm(p); | 1073 | mm = get_task_mm(p); |
1022 | if (!mm) | 1074 | if (!mm) |
@@ -1026,7 +1078,7 @@ static void cpuset_change_nodemask(struct task_struct *p, | |||
1026 | 1078 | ||
1027 | mpol_rebind_mm(mm, &cs->mems_allowed); | 1079 | mpol_rebind_mm(mm, &cs->mems_allowed); |
1028 | if (migrate) | 1080 | if (migrate) |
1029 | cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); | 1081 | cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); |
1030 | mmput(mm); | 1082 | mmput(mm); |
1031 | } | 1083 | } |
1032 | 1084 | ||
@@ -1035,25 +1087,27 @@ static void *cpuset_being_rebound; | |||
1035 | /** | 1087 | /** |
1036 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. | 1088 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. |
1037 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed | 1089 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed |
1038 | * @oldmem: old mems_allowed of cpuset cs | ||
1039 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1090 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() |
1040 | * | 1091 | * |
1041 | * Called with cpuset_mutex held | 1092 | * Called with cpuset_mutex held |
1042 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 1093 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 |
1043 | * if @heap != NULL. | 1094 | * if @heap != NULL. |
1044 | */ | 1095 | */ |
1045 | static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, | 1096 | static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) |
1046 | struct ptr_heap *heap) | ||
1047 | { | 1097 | { |
1098 | static nodemask_t newmems; /* protected by cpuset_mutex */ | ||
1048 | struct cgroup_scanner scan; | 1099 | struct cgroup_scanner scan; |
1100 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); | ||
1049 | 1101 | ||
1050 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ | 1102 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ |
1051 | 1103 | ||
1104 | guarantee_online_mems(mems_cs, &newmems); | ||
1105 | |||
1052 | scan.cg = cs->css.cgroup; | 1106 | scan.cg = cs->css.cgroup; |
1053 | scan.test_task = NULL; | 1107 | scan.test_task = NULL; |
1054 | scan.process_task = cpuset_change_nodemask; | 1108 | scan.process_task = cpuset_change_nodemask; |
1055 | scan.heap = heap; | 1109 | scan.heap = heap; |
1056 | scan.data = (nodemask_t *)oldmem; | 1110 | scan.data = &newmems; |
1057 | 1111 | ||
1058 | /* | 1112 | /* |
1059 | * The mpol_rebind_mm() call takes mmap_sem, which we couldn't | 1113 | * The mpol_rebind_mm() call takes mmap_sem, which we couldn't |
@@ -1067,11 +1121,56 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, | |||
1067 | */ | 1121 | */ |
1068 | cgroup_scan_tasks(&scan); | 1122 | cgroup_scan_tasks(&scan); |
1069 | 1123 | ||
1124 | /* | ||
1125 | * All the tasks' nodemasks have been updated, update | ||
1126 | * cs->old_mems_allowed. | ||
1127 | */ | ||
1128 | cs->old_mems_allowed = newmems; | ||
1129 | |||
1070 | /* We're done rebinding vmas to this cpuset's new mems_allowed. */ | 1130 | /* We're done rebinding vmas to this cpuset's new mems_allowed. */ |
1071 | cpuset_being_rebound = NULL; | 1131 | cpuset_being_rebound = NULL; |
1072 | } | 1132 | } |
1073 | 1133 | ||
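The reshuffle above computes the new nodemask once in update_tasks_nodemask(), keeps it in storage serialized by cpuset_mutex, and hands it to every task through scan.data instead of recomputing it in the per-task callback. Stripped of the cgroup machinery (the names below are invented), that is a plain callback-plus-shared-context iterator:

#include <stdio.h>

struct task {
        const char *comm;
        unsigned long mems;
};

struct scanner {
        void (*process)(struct task *t, void *data);
        void *data;                     /* shared context, computed once */
};

static void change_mems(struct task *t, void *data)
{
        t->mems = *(unsigned long *)data;       /* no per-task recomputation */
}

static void scan_tasks(struct scanner *s, struct task *tasks, int n)
{
        for (int i = 0; i < n; i++)
                s->process(&tasks[i], s->data);
}

int main(void)
{
        struct task tasks[] = { { "init", 0 }, { "kworker", 0 } };
        unsigned long newmems = 0x1;    /* computed once, e.g. from an ancestor */
        struct scanner s = { .process = change_mems, .data = &newmems };

        scan_tasks(&s, tasks, 2);
        printf("%s -> %#lx\n", tasks[0].comm, tasks[0].mems);
        return 0;
}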
1074 | /* | 1134 | /* |
1135 | * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. | ||
1136 | * @root_cs: the root cpuset of the hierarchy | ||
1137 | * @update_root: update the root cpuset or not? | ||
1138 | * @heap: the heap used by cgroup_scan_tasks() | ||
1139 | * | ||
1140 | * This will update nodemasks of tasks in @root_cs and all other empty cpusets | ||
1141 | * which take on nodemask of @root_cs. | ||
1142 | * | ||
1143 | * Called with cpuset_mutex held | ||
1144 | */ | ||
1145 | static void update_tasks_nodemask_hier(struct cpuset *root_cs, | ||
1146 | bool update_root, struct ptr_heap *heap) | ||
1147 | { | ||
1148 | struct cpuset *cp; | ||
1149 | struct cgroup *pos_cgrp; | ||
1150 | |||
1151 | if (update_root) | ||
1152 | update_tasks_nodemask(root_cs, heap); | ||
1153 | |||
1154 | rcu_read_lock(); | ||
1155 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | ||
1157 | /* skip the whole subtree if @cp has some memory */ | ||
1157 | if (!nodes_empty(cp->mems_allowed)) { | ||
1158 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | ||
1159 | continue; | ||
1160 | } | ||
1161 | if (!css_tryget(&cp->css)) | ||
1162 | continue; | ||
1163 | rcu_read_unlock(); | ||
1164 | |||
1165 | update_tasks_nodemask(cp, heap); | ||
1166 | |||
1167 | rcu_read_lock(); | ||
1168 | css_put(&cp->css); | ||
1169 | } | ||
1170 | rcu_read_unlock(); | ||
1171 | } | ||
1172 | |||
1173 | /* | ||
1075 | * Handle user request to change the 'mems' memory placement | 1174 | * Handle user request to change the 'mems' memory placement |
1076 | * of a cpuset. Needs to validate the request, update the | 1175 | * of a cpuset. Needs to validate the request, update the |
1077 | * cpusets mems_allowed, and for each task in the cpuset, | 1176 | * cpusets mems_allowed, and for each task in the cpuset, |
@@ -1087,13 +1186,9 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, | |||
1087 | static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | 1186 | static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, |
1088 | const char *buf) | 1187 | const char *buf) |
1089 | { | 1188 | { |
1090 | NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL); | ||
1091 | int retval; | 1189 | int retval; |
1092 | struct ptr_heap heap; | 1190 | struct ptr_heap heap; |
1093 | 1191 | ||
1094 | if (!oldmem) | ||
1095 | return -ENOMEM; | ||
1096 | |||
1097 | /* | 1192 | /* |
1098 | * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; | 1193 | * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; |
1099 | * it's read-only | 1194 | * it's read-only |
@@ -1122,8 +1217,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1122 | goto done; | 1217 | goto done; |
1123 | } | 1218 | } |
1124 | } | 1219 | } |
1125 | *oldmem = cs->mems_allowed; | 1220 | |
1126 | if (nodes_equal(*oldmem, trialcs->mems_allowed)) { | 1221 | if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { |
1127 | retval = 0; /* Too easy - nothing to do */ | 1222 | retval = 0; /* Too easy - nothing to do */ |
1128 | goto done; | 1223 | goto done; |
1129 | } | 1224 | } |
@@ -1139,11 +1234,10 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1139 | cs->mems_allowed = trialcs->mems_allowed; | 1234 | cs->mems_allowed = trialcs->mems_allowed; |
1140 | mutex_unlock(&callback_mutex); | 1235 | mutex_unlock(&callback_mutex); |
1141 | 1236 | ||
1142 | update_tasks_nodemask(cs, oldmem, &heap); | 1237 | update_tasks_nodemask_hier(cs, true, &heap); |
1143 | 1238 | ||
1144 | heap_free(&heap); | 1239 | heap_free(&heap); |
1145 | done: | 1240 | done: |
1146 | NODEMASK_FREE(oldmem); | ||
1147 | return retval; | 1241 | return retval; |
1148 | } | 1242 | } |
1149 | 1243 | ||
@@ -1372,8 +1466,13 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1372 | 1466 | ||
1373 | mutex_lock(&cpuset_mutex); | 1467 | mutex_lock(&cpuset_mutex); |
1374 | 1468 | ||
1469 | /* | ||
1470 | * We allow to move tasks into an empty cpuset if sane_behavior | ||
1471 | * flag is set. | ||
1472 | */ | ||
1375 | ret = -ENOSPC; | 1473 | ret = -ENOSPC; |
1376 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | 1474 | if (!cgroup_sane_behavior(cgrp) && |
1475 | (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) | ||
1377 | goto out_unlock; | 1476 | goto out_unlock; |
1378 | 1477 | ||
1379 | cgroup_taskset_for_each(task, cgrp, tset) { | 1478 | cgroup_taskset_for_each(task, cgrp, tset) { |
@@ -1422,8 +1521,7 @@ static cpumask_var_t cpus_attach; | |||
1422 | 1521 | ||
1423 | static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1522 | static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) |
1424 | { | 1523 | { |
1425 | /* static bufs protected by cpuset_mutex */ | 1524 | /* static buf protected by cpuset_mutex */ |
1426 | static nodemask_t cpuset_attach_nodemask_from; | ||
1427 | static nodemask_t cpuset_attach_nodemask_to; | 1525 | static nodemask_t cpuset_attach_nodemask_to; |
1428 | struct mm_struct *mm; | 1526 | struct mm_struct *mm; |
1429 | struct task_struct *task; | 1527 | struct task_struct *task; |
@@ -1431,6 +1529,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1431 | struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); | 1529 | struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); |
1432 | struct cpuset *cs = cgroup_cs(cgrp); | 1530 | struct cpuset *cs = cgroup_cs(cgrp); |
1433 | struct cpuset *oldcs = cgroup_cs(oldcgrp); | 1531 | struct cpuset *oldcs = cgroup_cs(oldcgrp); |
1532 | struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); | ||
1533 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); | ||
1434 | 1534 | ||
1435 | mutex_lock(&cpuset_mutex); | 1535 | mutex_lock(&cpuset_mutex); |
1436 | 1536 | ||
@@ -1438,9 +1538,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1438 | if (cs == &top_cpuset) | 1538 | if (cs == &top_cpuset) |
1439 | cpumask_copy(cpus_attach, cpu_possible_mask); | 1539 | cpumask_copy(cpus_attach, cpu_possible_mask); |
1440 | else | 1540 | else |
1441 | guarantee_online_cpus(cs, cpus_attach); | 1541 | guarantee_online_cpus(cpus_cs, cpus_attach); |
1442 | 1542 | ||
1443 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); | 1543 | guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); |
1444 | 1544 | ||
1445 | cgroup_taskset_for_each(task, cgrp, tset) { | 1545 | cgroup_taskset_for_each(task, cgrp, tset) { |
1446 | /* | 1546 | /* |
@@ -1457,26 +1557,32 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1457 | * Change mm, possibly for multiple threads in a threadgroup. This is | 1557 | * Change mm, possibly for multiple threads in a threadgroup. This is |
1458 | * expensive and may sleep. | 1558 | * expensive and may sleep. |
1459 | */ | 1559 | */ |
1460 | cpuset_attach_nodemask_from = oldcs->mems_allowed; | ||
1461 | cpuset_attach_nodemask_to = cs->mems_allowed; | 1560 | cpuset_attach_nodemask_to = cs->mems_allowed; |
1462 | mm = get_task_mm(leader); | 1561 | mm = get_task_mm(leader); |
1463 | if (mm) { | 1562 | if (mm) { |
1563 | struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs); | ||
1564 | |||
1464 | mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); | 1565 | mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); |
1465 | if (is_memory_migrate(cs)) | 1566 | |
1466 | cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, | 1567 | /* |
1568 | * old_mems_allowed is the same as mems_allowed here, except | ||
1569 | * if this task is being moved automatically due to hotplug. | ||
1570 | * In that case @mems_allowed has been updated and is empty, | ||
1571 | * so @old_mems_allowed is the right nodemask to migrate the | ||
1572 | * mm from. | ||
1573 | */ | ||
1574 | if (is_memory_migrate(cs)) { | ||
1575 | cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, | ||
1467 | &cpuset_attach_nodemask_to); | 1576 | &cpuset_attach_nodemask_to); |
1577 | } | ||
1468 | mmput(mm); | 1578 | mmput(mm); |
1469 | } | 1579 | } |
1470 | 1580 | ||
1471 | cs->attach_in_progress--; | 1581 | cs->old_mems_allowed = cpuset_attach_nodemask_to; |
1472 | 1582 | ||
1473 | /* | 1583 | cs->attach_in_progress--; |
1474 | * We may have raced with CPU/memory hotunplug. Trigger hotplug | 1584 | if (!cs->attach_in_progress) |
1475 | * propagation if @cs doesn't have any CPU or memory. It will move | 1585 | wake_up(&cpuset_attach_wq); |
1476 | * the newly added tasks to the nearest parent which can execute. | ||
1477 | */ | ||
1478 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | ||
1479 | schedule_cpuset_propagate_hotplug(cs); | ||
1480 | 1586 | ||
1481 | mutex_unlock(&cpuset_mutex); | 1587 | mutex_unlock(&cpuset_mutex); |
1482 | } | 1588 | } |
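The comment block above is the heart of old_mems_allowed: when a task is re-attached because its cpuset was emptied by hotplug, cs->mems_allowed is already clear, so page migration must use the mask remembered from before the hotplug event as its source. Reduced to toy bitmasks rather than kernel types, the bookkeeping looks roughly like this:

#include <assert.h>

struct cs {
        unsigned long mems;     /* current nodemask; hotplug may empty it */
        unsigned long old_mems; /* last mask the tasks were actually updated to */
};

/* roughly what update_tasks_nodemask() records once all tasks are updated */
static void commit_mems(struct cs *c, unsigned long newmems)
{
        c->mems = newmems;
        c->old_mems = newmems;
}

/* hotplug clears the current mask; old_mems keeps the migration source */
static void hotplug_offline_all(struct cs *c)
{
        c->mems = 0;
}

int main(void)
{
        struct cs c = { 0, 0 };

        commit_mems(&c, 0x3);           /* tasks were bound to nodes 0 and 1 */
        hotplug_offline_all(&c);        /* the nodes went away */

        /* re-attach migrates from old_mems, never from the now-empty mems */
        assert(c.old_mems == 0x3 && c.mems == 0);
        return 0;
}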
@@ -1588,13 +1694,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | |||
1588 | * resources, wait for the previously scheduled operations before | 1694 | * resources, wait for the previously scheduled operations before |
1589 | * proceeding, so that we don't end up keep removing tasks added | 1695 | * proceeding, so that we don't end up keep removing tasks added |
1590 | * after execution capability is restored. | 1696 | * after execution capability is restored. |
1591 | * | ||
1592 | * Flushing cpuset_hotplug_work is enough to synchronize against | ||
1593 | * hotplug hanlding; however, cpuset_attach() may schedule | ||
1594 | * propagation work directly. Flush the workqueue too. | ||
1595 | */ | 1697 | */ |
1596 | flush_work(&cpuset_hotplug_work); | 1698 | flush_work(&cpuset_hotplug_work); |
1597 | flush_workqueue(cpuset_propagate_hotplug_wq); | ||
1598 | 1699 | ||
1599 | mutex_lock(&cpuset_mutex); | 1700 | mutex_lock(&cpuset_mutex); |
1600 | if (!is_cpuset_online(cs)) | 1701 | if (!is_cpuset_online(cs)) |
@@ -1658,13 +1759,13 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
1658 | return count; | 1759 | return count; |
1659 | } | 1760 | } |
1660 | 1761 | ||
1661 | static ssize_t cpuset_common_file_read(struct cgroup *cont, | 1762 | static ssize_t cpuset_common_file_read(struct cgroup *cgrp, |
1662 | struct cftype *cft, | 1763 | struct cftype *cft, |
1663 | struct file *file, | 1764 | struct file *file, |
1664 | char __user *buf, | 1765 | char __user *buf, |
1665 | size_t nbytes, loff_t *ppos) | 1766 | size_t nbytes, loff_t *ppos) |
1666 | { | 1767 | { |
1667 | struct cpuset *cs = cgroup_cs(cont); | 1768 | struct cpuset *cs = cgroup_cs(cgrp); |
1668 | cpuset_filetype_t type = cft->private; | 1769 | cpuset_filetype_t type = cft->private; |
1669 | char *page; | 1770 | char *page; |
1670 | ssize_t retval = 0; | 1771 | ssize_t retval = 0; |
@@ -1694,9 +1795,9 @@ out: | |||
1694 | return retval; | 1795 | return retval; |
1695 | } | 1796 | } |
1696 | 1797 | ||
1697 | static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) | 1798 | static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) |
1698 | { | 1799 | { |
1699 | struct cpuset *cs = cgroup_cs(cont); | 1800 | struct cpuset *cs = cgroup_cs(cgrp); |
1700 | cpuset_filetype_t type = cft->private; | 1801 | cpuset_filetype_t type = cft->private; |
1701 | switch (type) { | 1802 | switch (type) { |
1702 | case FILE_CPU_EXCLUSIVE: | 1803 | case FILE_CPU_EXCLUSIVE: |
@@ -1725,9 +1826,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) | |||
1725 | return 0; | 1826 | return 0; |
1726 | } | 1827 | } |
1727 | 1828 | ||
1728 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | 1829 | static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) |
1729 | { | 1830 | { |
1730 | struct cpuset *cs = cgroup_cs(cont); | 1831 | struct cpuset *cs = cgroup_cs(cgrp); |
1731 | cpuset_filetype_t type = cft->private; | 1832 | cpuset_filetype_t type = cft->private; |
1732 | switch (type) { | 1833 | switch (type) { |
1733 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | 1834 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: |
@@ -1839,14 +1940,14 @@ static struct cftype files[] = { | |||
1839 | 1940 | ||
1840 | /* | 1941 | /* |
1841 | * cpuset_css_alloc - allocate a cpuset css | 1942 | * cpuset_css_alloc - allocate a cpuset css |
1842 | * cont: control group that the new cpuset will be part of | 1943 | * cgrp: control group that the new cpuset will be part of |
1843 | */ | 1944 | */ |
1844 | 1945 | ||
1845 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) | 1946 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) |
1846 | { | 1947 | { |
1847 | struct cpuset *cs; | 1948 | struct cpuset *cs; |
1848 | 1949 | ||
1849 | if (!cont->parent) | 1950 | if (!cgrp->parent) |
1850 | return &top_cpuset.css; | 1951 | return &top_cpuset.css; |
1851 | 1952 | ||
1852 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); | 1953 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); |
@@ -1861,7 +1962,6 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) | |||
1861 | cpumask_clear(cs->cpus_allowed); | 1962 | cpumask_clear(cs->cpus_allowed); |
1862 | nodes_clear(cs->mems_allowed); | 1963 | nodes_clear(cs->mems_allowed); |
1863 | fmeter_init(&cs->fmeter); | 1964 | fmeter_init(&cs->fmeter); |
1864 | INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); | ||
1865 | cs->relax_domain_level = -1; | 1965 | cs->relax_domain_level = -1; |
1866 | 1966 | ||
1867 | return &cs->css; | 1967 | return &cs->css; |
@@ -1942,9 +2042,9 @@ static void cpuset_css_offline(struct cgroup *cgrp) | |||
1942 | * will call rebuild_sched_domains_locked(). | 2042 | * will call rebuild_sched_domains_locked(). |
1943 | */ | 2043 | */ |
1944 | 2044 | ||
1945 | static void cpuset_css_free(struct cgroup *cont) | 2045 | static void cpuset_css_free(struct cgroup *cgrp) |
1946 | { | 2046 | { |
1947 | struct cpuset *cs = cgroup_cs(cont); | 2047 | struct cpuset *cs = cgroup_cs(cgrp); |
1948 | 2048 | ||
1949 | free_cpumask_var(cs->cpus_allowed); | 2049 | free_cpumask_var(cs->cpus_allowed); |
1950 | kfree(cs); | 2050 | kfree(cs); |
@@ -2024,41 +2124,64 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
2024 | } | 2124 | } |
2025 | 2125 | ||
2026 | /** | 2126 | /** |
2027 | * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset | 2127 | * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug |
2028 | * @cs: cpuset in interest | 2128 | * @cs: cpuset in interest |
2029 | * | 2129 | * |
2030 | * Compare @cs's cpu and mem masks against top_cpuset and if some have gone | 2130 | * Compare @cs's cpu and mem masks against top_cpuset and if some have gone |
2031 | * offline, update @cs accordingly. If @cs ends up with no CPU or memory, | 2131 | * offline, update @cs accordingly. If @cs ends up with no CPU or memory, |
2032 | * all its tasks are moved to the nearest ancestor with both resources. | 2132 | * all its tasks are moved to the nearest ancestor with both resources. |
2033 | */ | 2133 | */ |
2034 | static void cpuset_propagate_hotplug_workfn(struct work_struct *work) | 2134 | static void cpuset_hotplug_update_tasks(struct cpuset *cs) |
2035 | { | 2135 | { |
2036 | static cpumask_t off_cpus; | 2136 | static cpumask_t off_cpus; |
2037 | static nodemask_t off_mems, tmp_mems; | 2137 | static nodemask_t off_mems; |
2038 | struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); | ||
2039 | bool is_empty; | 2138 | bool is_empty; |
2139 | bool sane = cgroup_sane_behavior(cs->css.cgroup); | ||
2140 | |||
2141 | retry: | ||
2142 | wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); | ||
2040 | 2143 | ||
2041 | mutex_lock(&cpuset_mutex); | 2144 | mutex_lock(&cpuset_mutex); |
2042 | 2145 | ||
2146 | /* | ||
2147 | * We have raced with task attaching. We wait until attaching | ||
2148 | * is finished, so we won't attach a task to an empty cpuset. | ||
2149 | */ | ||
2150 | if (cs->attach_in_progress) { | ||
2151 | mutex_unlock(&cpuset_mutex); | ||
2152 | goto retry; | ||
2153 | } | ||
2154 | |||
2043 | cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); | 2155 | cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); |
2044 | nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); | 2156 | nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); |
2045 | 2157 | ||
2046 | /* remove offline cpus from @cs */ | 2158 | mutex_lock(&callback_mutex); |
2047 | if (!cpumask_empty(&off_cpus)) { | 2159 | cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); |
2048 | mutex_lock(&callback_mutex); | 2160 | mutex_unlock(&callback_mutex); |
2049 | cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); | 2161 | |
2050 | mutex_unlock(&callback_mutex); | 2162 | /* |
2163 | * If sane_behavior flag is set, we need to update tasks' cpumask | ||
2164 | * for empty cpuset to take on ancestor's cpumask. Otherwise, don't | ||
2165 | * call update_tasks_cpumask() if the cpuset becomes empty, as | ||
2166 | * the tasks in it will be migrated to an ancestor. | ||
2167 | */ | ||
2168 | if ((sane && cpumask_empty(cs->cpus_allowed)) || | ||
2169 | (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) | ||
2051 | update_tasks_cpumask(cs, NULL); | 2170 | update_tasks_cpumask(cs, NULL); |
2052 | } | ||
2053 | 2171 | ||
2054 | /* remove offline mems from @cs */ | 2172 | mutex_lock(&callback_mutex); |
2055 | if (!nodes_empty(off_mems)) { | 2173 | nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); |
2056 | tmp_mems = cs->mems_allowed; | 2174 | mutex_unlock(&callback_mutex); |
2057 | mutex_lock(&callback_mutex); | 2175 | |
2058 | nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); | 2176 | /* |
2059 | mutex_unlock(&callback_mutex); | 2177 | * If sane_behavior flag is set, we need to update tasks' nodemask |
2060 | update_tasks_nodemask(cs, &tmp_mems, NULL); | 2178 | * for empty cpuset to take on ancestor's nodemask. Otherwise, don't |
2061 | } | 2179 | * call update_tasks_nodemask() if the cpuset becomes empty, as |
2180 | * the tasks in it will be migrated to an ancestor. | ||
2181 | */ | ||
2182 | if ((sane && nodes_empty(cs->mems_allowed)) || | ||
2183 | (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) | ||
2184 | update_tasks_nodemask(cs, NULL); | ||
2062 | 2185 | ||
2063 | is_empty = cpumask_empty(cs->cpus_allowed) || | 2186 | is_empty = cpumask_empty(cs->cpus_allowed) || |
2064 | nodes_empty(cs->mems_allowed); | 2187 | nodes_empty(cs->mems_allowed); |
@@ -2066,40 +2189,14 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work) | |||
2066 | mutex_unlock(&cpuset_mutex); | 2189 | mutex_unlock(&cpuset_mutex); |
2067 | 2190 | ||
2068 | /* | 2191 | /* |
2069 | * If @cs became empty, move tasks to the nearest ancestor with | 2192 | * If sane_behavior flag is set, we'll keep tasks in empty cpusets. |
2070 | * execution resources. This is full cgroup operation which will | 2193 | * |
2194 | * Otherwise move tasks to the nearest ancestor with execution | ||
2195 | * resources. This is a full cgroup operation which will | ||
2071 | * also call back into cpuset. Should be done outside any lock. | 2196 | * also call back into cpuset. Should be done outside any lock. |
2072 | */ | 2197 | */ |
2073 | if (is_empty) | 2198 | if (!sane && is_empty) |
2074 | remove_tasks_in_empty_cpuset(cs); | 2199 | remove_tasks_in_empty_cpuset(cs); |
2075 | |||
2076 | /* the following may free @cs, should be the last operation */ | ||
2077 | css_put(&cs->css); | ||
2078 | } | ||
2079 | |||
2080 | /** | ||
2081 | * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset | ||
2082 | * @cs: cpuset of interest | ||
2083 | * | ||
2084 | * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and | ||
2085 | * memory masks according to top_cpuset. | ||
2086 | */ | ||
2087 | static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) | ||
2088 | { | ||
2089 | /* | ||
2090 | * Pin @cs. The refcnt will be released when the work item | ||
2091 | * finishes executing. | ||
2092 | */ | ||
2093 | if (!css_tryget(&cs->css)) | ||
2094 | return; | ||
2095 | |||
2096 | /* | ||
2097 | * Queue @cs->hotplug_work. If already pending, lose the css ref. | ||
2098 | * cpuset_propagate_hotplug_wq is ordered and propagation will | ||
2099 | * happen in the order this function is called. | ||
2100 | */ | ||
2101 | if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) | ||
2102 | css_put(&cs->css); | ||
2103 | } | 2200 | } |
2104 | 2201 | ||
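The retry loop at the top of cpuset_hotplug_update_tasks() — wait for attach_in_progress to reach zero, take cpuset_mutex, and go back to waiting if another attach slipped in — pairs with the wake_up() now done at the end of cpuset_attach(). The same handshake expressed in plain pthreads, with invented names and none of the cgroup context:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  attach_wq = PTHREAD_COND_INITIALIZER;
static int attach_in_progress;

static void *attacher(void *arg)
{
        pthread_mutex_lock(&lock);
        attach_in_progress++;
        pthread_mutex_unlock(&lock);

        /* ... move tasks, possibly sleeping ... */

        pthread_mutex_lock(&lock);
        if (--attach_in_progress == 0)
                pthread_cond_broadcast(&attach_wq);     /* wake_up(&cpuset_attach_wq) */
        pthread_mutex_unlock(&lock);
        return NULL;
}

static void hotplug_update_tasks(void)
{
        pthread_mutex_lock(&lock);
        /* wait_event(): don't shrink the cpuset while an attach is mid-flight */
        while (attach_in_progress)
                pthread_cond_wait(&attach_wq, &lock);

        /* ... update masks and tasks with the lock held ... */
        printf("no attach in flight, safe to update\n");
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, attacher, NULL);
        pthread_join(t, NULL);
        hotplug_update_tasks();
        return 0;
}

pthread_cond_wait() re-acquires the mutex before returning, so the explicit retry collapses into the while condition here; the kernel needs the goto because wait_event() and mutex_lock() are separate steps.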
2105 | /** | 2202 | /** |
@@ -2112,18 +2209,17 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) | |||
2112 | * actively using CPU hotplug but making no active use of cpusets. | 2209 | * actively using CPU hotplug but making no active use of cpusets. |
2113 | * | 2210 | * |
2114 | * Non-root cpusets are only affected by offlining. If any CPUs or memory | 2211 | * Non-root cpusets are only affected by offlining. If any CPUs or memory |
2115 | * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all | 2212 | * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on |
2116 | * descendants. | 2213 | * all descendants. |
2117 | * | 2214 | * |
2118 | * Note that CPU offlining during suspend is ignored. We don't modify | 2215 | * Note that CPU offlining during suspend is ignored. We don't modify |
2119 | * cpusets across suspend/resume cycles at all. | 2216 | * cpusets across suspend/resume cycles at all. |
2120 | */ | 2217 | */ |
2121 | static void cpuset_hotplug_workfn(struct work_struct *work) | 2218 | static void cpuset_hotplug_workfn(struct work_struct *work) |
2122 | { | 2219 | { |
2123 | static cpumask_t new_cpus, tmp_cpus; | 2220 | static cpumask_t new_cpus; |
2124 | static nodemask_t new_mems, tmp_mems; | 2221 | static nodemask_t new_mems; |
2125 | bool cpus_updated, mems_updated; | 2222 | bool cpus_updated, mems_updated; |
2126 | bool cpus_offlined, mems_offlined; | ||
2127 | 2223 | ||
2128 | mutex_lock(&cpuset_mutex); | 2224 | mutex_lock(&cpuset_mutex); |
2129 | 2225 | ||
@@ -2132,12 +2228,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
2132 | new_mems = node_states[N_MEMORY]; | 2228 | new_mems = node_states[N_MEMORY]; |
2133 | 2229 | ||
2134 | cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); | 2230 | cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); |
2135 | cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed, | ||
2136 | &new_cpus); | ||
2137 | |||
2138 | mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); | 2231 | mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); |
2139 | nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems); | ||
2140 | mems_offlined = !nodes_empty(tmp_mems); | ||
2141 | 2232 | ||
2142 | /* synchronize cpus_allowed to cpu_active_mask */ | 2233 | /* synchronize cpus_allowed to cpu_active_mask */ |
2143 | if (cpus_updated) { | 2234 | if (cpus_updated) { |
@@ -2149,28 +2240,32 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
2149 | 2240 | ||
2150 | /* synchronize mems_allowed to N_MEMORY */ | 2241 | /* synchronize mems_allowed to N_MEMORY */ |
2151 | if (mems_updated) { | 2242 | if (mems_updated) { |
2152 | tmp_mems = top_cpuset.mems_allowed; | ||
2153 | mutex_lock(&callback_mutex); | 2243 | mutex_lock(&callback_mutex); |
2154 | top_cpuset.mems_allowed = new_mems; | 2244 | top_cpuset.mems_allowed = new_mems; |
2155 | mutex_unlock(&callback_mutex); | 2245 | mutex_unlock(&callback_mutex); |
2156 | update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); | 2246 | update_tasks_nodemask(&top_cpuset, NULL); |
2157 | } | 2247 | } |
2158 | 2248 | ||
2159 | /* if cpus or mems went down, we need to propagate to descendants */ | 2249 | mutex_unlock(&cpuset_mutex); |
2160 | if (cpus_offlined || mems_offlined) { | 2250 | |
2251 | /* if cpus or mems changed, we need to propagate to descendants */ | ||
2252 | if (cpus_updated || mems_updated) { | ||
2161 | struct cpuset *cs; | 2253 | struct cpuset *cs; |
2162 | struct cgroup *pos_cgrp; | 2254 | struct cgroup *pos_cgrp; |
2163 | 2255 | ||
2164 | rcu_read_lock(); | 2256 | rcu_read_lock(); |
2165 | cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) | 2257 | cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { |
2166 | schedule_cpuset_propagate_hotplug(cs); | 2258 | if (!css_tryget(&cs->css)) |
2167 | rcu_read_unlock(); | 2259 | continue; |
2168 | } | 2260 | rcu_read_unlock(); |
2169 | 2261 | ||
2170 | mutex_unlock(&cpuset_mutex); | 2262 | cpuset_hotplug_update_tasks(cs); |
2171 | 2263 | ||
2172 | /* wait for propagations to finish */ | 2264 | rcu_read_lock(); |
2173 | flush_workqueue(cpuset_propagate_hotplug_wq); | 2265 | css_put(&cs->css); |
2266 | } | ||
2267 | rcu_read_unlock(); | ||
2268 | } | ||
2174 | 2269 | ||
2175 | /* rebuild sched domains if cpus_allowed has changed */ | 2270 | /* rebuild sched domains if cpus_allowed has changed */ |
2176 | if (cpus_updated) | 2271 | if (cpus_updated) |
@@ -2219,12 +2314,9 @@ void __init cpuset_init_smp(void) | |||
2219 | { | 2314 | { |
2220 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2315 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); |
2221 | top_cpuset.mems_allowed = node_states[N_MEMORY]; | 2316 | top_cpuset.mems_allowed = node_states[N_MEMORY]; |
2317 | top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; | ||
2222 | 2318 | ||
2223 | register_hotmemory_notifier(&cpuset_track_online_nodes_nb); | 2319 | register_hotmemory_notifier(&cpuset_track_online_nodes_nb); |
2224 | |||
2225 | cpuset_propagate_hotplug_wq = | ||
2226 | alloc_ordered_workqueue("cpuset_hotplug", 0); | ||
2227 | BUG_ON(!cpuset_propagate_hotplug_wq); | ||
2228 | } | 2320 | } |
2229 | 2321 | ||
2230 | /** | 2322 | /** |
@@ -2240,21 +2332,23 @@ void __init cpuset_init_smp(void) | |||
2240 | 2332 | ||
2241 | void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) | 2333 | void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) |
2242 | { | 2334 | { |
2335 | struct cpuset *cpus_cs; | ||
2336 | |||
2243 | mutex_lock(&callback_mutex); | 2337 | mutex_lock(&callback_mutex); |
2244 | task_lock(tsk); | 2338 | task_lock(tsk); |
2245 | guarantee_online_cpus(task_cs(tsk), pmask); | 2339 | cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); |
2340 | guarantee_online_cpus(cpus_cs, pmask); | ||
2246 | task_unlock(tsk); | 2341 | task_unlock(tsk); |
2247 | mutex_unlock(&callback_mutex); | 2342 | mutex_unlock(&callback_mutex); |
2248 | } | 2343 | } |
2249 | 2344 | ||
2250 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) | 2345 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) |
2251 | { | 2346 | { |
2252 | const struct cpuset *cs; | 2347 | const struct cpuset *cpus_cs; |
2253 | 2348 | ||
2254 | rcu_read_lock(); | 2349 | rcu_read_lock(); |
2255 | cs = task_cs(tsk); | 2350 | cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); |
2256 | if (cs) | 2351 | do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed); |
2257 | do_set_cpus_allowed(tsk, cs->cpus_allowed); | ||
2258 | rcu_read_unlock(); | 2352 | rcu_read_unlock(); |
2259 | 2353 | ||
2260 | /* | 2354 | /* |
@@ -2293,11 +2387,13 @@ void cpuset_init_current_mems_allowed(void) | |||
2293 | 2387 | ||
2294 | nodemask_t cpuset_mems_allowed(struct task_struct *tsk) | 2388 | nodemask_t cpuset_mems_allowed(struct task_struct *tsk) |
2295 | { | 2389 | { |
2390 | struct cpuset *mems_cs; | ||
2296 | nodemask_t mask; | 2391 | nodemask_t mask; |
2297 | 2392 | ||
2298 | mutex_lock(&callback_mutex); | 2393 | mutex_lock(&callback_mutex); |
2299 | task_lock(tsk); | 2394 | task_lock(tsk); |
2300 | guarantee_online_mems(task_cs(tsk), &mask); | 2395 | mems_cs = effective_nodemask_cpuset(task_cs(tsk)); |
2396 | guarantee_online_mems(mems_cs, &mask); | ||
2301 | task_unlock(tsk); | 2397 | task_unlock(tsk); |
2302 | mutex_unlock(&callback_mutex); | 2398 | mutex_unlock(&callback_mutex); |
2303 | 2399 | ||
diff --git a/kernel/events/core.c b/kernel/events/core.c index 1db3af933704..1833bc5a84a7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -182,7 +182,7 @@ void update_perf_cpu_limits(void) | |||
182 | u64 tmp = perf_sample_period_ns; | 182 | u64 tmp = perf_sample_period_ns; |
183 | 183 | ||
184 | tmp *= sysctl_perf_cpu_time_max_percent; | 184 | tmp *= sysctl_perf_cpu_time_max_percent; |
185 | tmp = do_div(tmp, 100); | 185 | do_div(tmp, 100); |
186 | atomic_set(&perf_sample_allowed_ns, tmp); | 186 | atomic_set(&perf_sample_allowed_ns, tmp); |
187 | } | 187 | } |
188 | 188 | ||
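The change above hinges on do_div()'s calling convention: it divides the 64-bit value in place and returns the remainder, so assigning the result back discarded the quotient and left tmp holding tmp % 100. A small illustration of that contract — toy_do_div() below is a plain-C stand-in written for this sketch, not the kernel macro:

#include <stdint.h>
#include <stdio.h>

/* toy stand-in: quotient left in *n, remainder returned, like the kernel macro */
static uint32_t toy_do_div(uint64_t *n, uint32_t base)
{
        uint32_t rem = (uint32_t)(*n % base);

        *n /= base;
        return rem;
}

int main(void)
{
        uint64_t tmp = 250;

        /* buggy form: tmp = do_div(tmp, 100) keeps the remainder (50) */
        uint64_t buggy = tmp;
        buggy = toy_do_div(&buggy, 100);

        /* fixed form: do_div(tmp, 100) keeps the quotient (2) */
        uint64_t fixed = tmp;
        toy_do_div(&fixed, 100);

        printf("buggy=%llu fixed=%llu\n",
               (unsigned long long)buggy, (unsigned long long)fixed);
        return 0;
}

In the kernel macro the quotient stays in the variable passed in, which is why dropping the assignment is the whole fix.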
@@ -232,7 +232,7 @@ DEFINE_PER_CPU(u64, running_sample_length); | |||
232 | void perf_sample_event_took(u64 sample_len_ns) | 232 | void perf_sample_event_took(u64 sample_len_ns) |
233 | { | 233 | { |
234 | u64 avg_local_sample_len; | 234 | u64 avg_local_sample_len; |
235 | u64 local_samples_len = __get_cpu_var(running_sample_length); | 235 | u64 local_samples_len; |
236 | 236 | ||
237 | if (atomic_read(&perf_sample_allowed_ns) == 0) | 237 | if (atomic_read(&perf_sample_allowed_ns) == 0) |
238 | return; | 238 | return; |
diff --git a/kernel/exit.c b/kernel/exit.c index 7bb73f9d09db..a949819055d5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -312,17 +312,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) | |||
312 | } | 312 | } |
313 | } | 313 | } |
314 | 314 | ||
315 | void __set_special_pids(struct pid *pid) | ||
316 | { | ||
317 | struct task_struct *curr = current->group_leader; | ||
318 | |||
319 | if (task_session(curr) != pid) | ||
320 | change_pid(curr, PIDTYPE_SID, pid); | ||
321 | |||
322 | if (task_pgrp(curr) != pid) | ||
323 | change_pid(curr, PIDTYPE_PGID, pid); | ||
324 | } | ||
325 | |||
326 | /* | 315 | /* |
327 | * Let kernel threads use this to say that they allow a certain signal. | 316 | * Let kernel threads use this to say that they allow a certain signal. |
328 | * Must not be used if kthread was cloned with CLONE_SIGHAND. | 317 | * Must not be used if kthread was cloned with CLONE_SIGHAND. |
@@ -819,7 +808,7 @@ void do_exit(long code) | |||
819 | /* | 808 | /* |
820 | * FIXME: do that only when needed, using sched_exit tracepoint | 809 | * FIXME: do that only when needed, using sched_exit tracepoint |
821 | */ | 810 | */ |
822 | ptrace_put_breakpoints(tsk); | 811 | flush_ptrace_hw_breakpoint(tsk); |
823 | 812 | ||
824 | exit_notify(tsk, group_dead); | 813 | exit_notify(tsk, group_dead); |
825 | #ifdef CONFIG_NUMA | 814 | #ifdef CONFIG_NUMA |
@@ -835,7 +824,7 @@ void do_exit(long code) | |||
835 | /* | 824 | /* |
836 | * Make sure we are holding no locks: | 825 | * Make sure we are holding no locks: |
837 | */ | 826 | */ |
838 | debug_check_no_locks_held(tsk); | 827 | debug_check_no_locks_held(); |
839 | /* | 828 | /* |
840 | * We can do this unlocked here. The futex code uses this flag | 829 | * We can do this unlocked here. The futex code uses this flag |
841 | * just to verify whether the pi state cleanup has been done | 830 | * just to verify whether the pi state cleanup has been done |
diff --git a/kernel/fork.c b/kernel/fork.c index 987b28a1f01b..66635c80a813 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -365,8 +365,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
365 | mm->locked_vm = 0; | 365 | mm->locked_vm = 0; |
366 | mm->mmap = NULL; | 366 | mm->mmap = NULL; |
367 | mm->mmap_cache = NULL; | 367 | mm->mmap_cache = NULL; |
368 | mm->free_area_cache = oldmm->mmap_base; | ||
369 | mm->cached_hole_size = ~0UL; | ||
370 | mm->map_count = 0; | 368 | mm->map_count = 0; |
371 | cpumask_clear(mm_cpumask(mm)); | 369 | cpumask_clear(mm_cpumask(mm)); |
372 | mm->mm_rb = RB_ROOT; | 370 | mm->mm_rb = RB_ROOT; |
@@ -540,8 +538,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) | |||
540 | mm->nr_ptes = 0; | 538 | mm->nr_ptes = 0; |
541 | memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); | 539 | memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); |
542 | spin_lock_init(&mm->page_table_lock); | 540 | spin_lock_init(&mm->page_table_lock); |
543 | mm->free_area_cache = TASK_UNMAPPED_BASE; | ||
544 | mm->cached_hole_size = ~0UL; | ||
545 | mm_init_aio(mm); | 541 | mm_init_aio(mm); |
546 | mm_init_owner(mm, p); | 542 | mm_init_owner(mm, p); |
547 | 543 | ||
@@ -1121,6 +1117,12 @@ static void posix_cpu_timers_init(struct task_struct *tsk) | |||
1121 | INIT_LIST_HEAD(&tsk->cpu_timers[2]); | 1117 | INIT_LIST_HEAD(&tsk->cpu_timers[2]); |
1122 | } | 1118 | } |
1123 | 1119 | ||
1120 | static inline void | ||
1121 | init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) | ||
1122 | { | ||
1123 | task->pids[type].pid = pid; | ||
1124 | } | ||
1125 | |||
1124 | /* | 1126 | /* |
1125 | * This creates a new process as a copy of the old one, | 1127 | * This creates a new process as a copy of the old one, |
1126 | * but does not actually start it yet. | 1128 | * but does not actually start it yet. |
@@ -1199,8 +1201,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1199 | retval = -EAGAIN; | 1201 | retval = -EAGAIN; |
1200 | if (atomic_read(&p->real_cred->user->processes) >= | 1202 | if (atomic_read(&p->real_cred->user->processes) >= |
1201 | task_rlimit(p, RLIMIT_NPROC)) { | 1203 | task_rlimit(p, RLIMIT_NPROC)) { |
1202 | if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && | 1204 | if (p->real_cred->user != INIT_USER && |
1203 | p->real_cred->user != INIT_USER) | 1205 | !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) |
1204 | goto bad_fork_free; | 1206 | goto bad_fork_free; |
1205 | } | 1207 | } |
1206 | current->flags &= ~PF_NPROC_EXCEEDED; | 1208 | current->flags &= ~PF_NPROC_EXCEEDED; |
@@ -1354,11 +1356,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1354 | goto bad_fork_cleanup_io; | 1356 | goto bad_fork_cleanup_io; |
1355 | } | 1357 | } |
1356 | 1358 | ||
1357 | p->pid = pid_nr(pid); | ||
1358 | p->tgid = p->pid; | ||
1359 | if (clone_flags & CLONE_THREAD) | ||
1360 | p->tgid = current->tgid; | ||
1361 | |||
1362 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; | 1359 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; |
1363 | /* | 1360 | /* |
1364 | * Clear TID on mm_release()? | 1361 | * Clear TID on mm_release()? |
@@ -1394,12 +1391,19 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1394 | clear_all_latency_tracing(p); | 1391 | clear_all_latency_tracing(p); |
1395 | 1392 | ||
1396 | /* ok, now we should be set up.. */ | 1393 | /* ok, now we should be set up.. */ |
1397 | if (clone_flags & CLONE_THREAD) | 1394 | p->pid = pid_nr(pid); |
1395 | if (clone_flags & CLONE_THREAD) { | ||
1398 | p->exit_signal = -1; | 1396 | p->exit_signal = -1; |
1399 | else if (clone_flags & CLONE_PARENT) | 1397 | p->group_leader = current->group_leader; |
1400 | p->exit_signal = current->group_leader->exit_signal; | 1398 | p->tgid = current->tgid; |
1401 | else | 1399 | } else { |
1402 | p->exit_signal = (clone_flags & CSIGNAL); | 1400 | if (clone_flags & CLONE_PARENT) |
1401 | p->exit_signal = current->group_leader->exit_signal; | ||
1402 | else | ||
1403 | p->exit_signal = (clone_flags & CSIGNAL); | ||
1404 | p->group_leader = p; | ||
1405 | p->tgid = p->pid; | ||
1406 | } | ||
1403 | 1407 | ||
1404 | p->pdeath_signal = 0; | 1408 | p->pdeath_signal = 0; |
1405 | p->exit_state = 0; | 1409 | p->exit_state = 0; |
@@ -1408,15 +1412,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1408 | p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); | 1412 | p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); |
1409 | p->dirty_paused_when = 0; | 1413 | p->dirty_paused_when = 0; |
1410 | 1414 | ||
1411 | /* | ||
1412 | * Ok, make it visible to the rest of the system. | ||
1413 | * We dont wake it up yet. | ||
1414 | */ | ||
1415 | p->group_leader = p; | ||
1416 | INIT_LIST_HEAD(&p->thread_group); | 1415 | INIT_LIST_HEAD(&p->thread_group); |
1417 | p->task_works = NULL; | 1416 | p->task_works = NULL; |
1418 | 1417 | ||
1419 | /* Need tasklist lock for parent etc handling! */ | 1418 | /* |
1419 | * Make it visible to the rest of the system, but dont wake it up yet. | ||
1420 | * Need tasklist lock for parent etc handling! | ||
1421 | */ | ||
1420 | write_lock_irq(&tasklist_lock); | 1422 | write_lock_irq(&tasklist_lock); |
1421 | 1423 | ||
1422 | /* CLONE_PARENT re-uses the old parent */ | 1424 | /* CLONE_PARENT re-uses the old parent */ |
@@ -1446,18 +1448,14 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1446 | goto bad_fork_free_pid; | 1448 | goto bad_fork_free_pid; |
1447 | } | 1449 | } |
1448 | 1450 | ||
1449 | if (clone_flags & CLONE_THREAD) { | ||
1450 | current->signal->nr_threads++; | ||
1451 | atomic_inc(¤t->signal->live); | ||
1452 | atomic_inc(¤t->signal->sigcnt); | ||
1453 | p->group_leader = current->group_leader; | ||
1454 | list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); | ||
1455 | } | ||
1456 | |||
1457 | if (likely(p->pid)) { | 1451 | if (likely(p->pid)) { |
1458 | ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); | 1452 | ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); |
1459 | 1453 | ||
1454 | init_task_pid(p, PIDTYPE_PID, pid); | ||
1460 | if (thread_group_leader(p)) { | 1455 | if (thread_group_leader(p)) { |
1456 | init_task_pid(p, PIDTYPE_PGID, task_pgrp(current)); | ||
1457 | init_task_pid(p, PIDTYPE_SID, task_session(current)); | ||
1458 | |||
1461 | if (is_child_reaper(pid)) { | 1459 | if (is_child_reaper(pid)) { |
1462 | ns_of_pid(pid)->child_reaper = p; | 1460 | ns_of_pid(pid)->child_reaper = p; |
1463 | p->signal->flags |= SIGNAL_UNKILLABLE; | 1461 | p->signal->flags |= SIGNAL_UNKILLABLE; |
@@ -1465,13 +1463,19 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1465 | 1463 | ||
1466 | p->signal->leader_pid = pid; | 1464 | p->signal->leader_pid = pid; |
1467 | p->signal->tty = tty_kref_get(current->signal->tty); | 1465 | p->signal->tty = tty_kref_get(current->signal->tty); |
1468 | attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); | ||
1469 | attach_pid(p, PIDTYPE_SID, task_session(current)); | ||
1470 | list_add_tail(&p->sibling, &p->real_parent->children); | 1466 | list_add_tail(&p->sibling, &p->real_parent->children); |
1471 | list_add_tail_rcu(&p->tasks, &init_task.tasks); | 1467 | list_add_tail_rcu(&p->tasks, &init_task.tasks); |
1468 | attach_pid(p, PIDTYPE_PGID); | ||
1469 | attach_pid(p, PIDTYPE_SID); | ||
1472 | __this_cpu_inc(process_counts); | 1470 | __this_cpu_inc(process_counts); |
1471 | } else { | ||
1472 | current->signal->nr_threads++; | ||
1473 | atomic_inc(¤t->signal->live); | ||
1474 | atomic_inc(¤t->signal->sigcnt); | ||
1475 | list_add_tail_rcu(&p->thread_group, | ||
1476 | &p->group_leader->thread_group); | ||
1473 | } | 1477 | } |
1474 | attach_pid(p, PIDTYPE_PID, pid); | 1478 | attach_pid(p, PIDTYPE_PID); |
1475 | nr_threads++; | 1479 | nr_threads++; |
1476 | } | 1480 | } |
1477 | 1481 | ||
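The copy_process() rework separates recording a pid link from publishing it: init_task_pid() only stores the struct pid pointer for each type, and attach_pid() later hooks the task in once tasklist_lock is held, with the thread-group-leader branch deciding which types get fresh links. A stripped-down sketch of that two-step shape, on toy types with no locking or hashing:

#include <stdio.h>

enum pid_type { PIDTYPE_PID, PIDTYPE_PGID, PIDTYPE_SID, PIDTYPE_MAX };

struct pid {
        int nr;
        int ntasks[PIDTYPE_MAX];        /* stand-in for the per-type hash chains */
};

struct task {
        struct pid *pids[PIDTYPE_MAX];
};

/* step 1: remember which pid this task will use for @type */
static void init_task_pid(struct task *t, enum pid_type type, struct pid *pid)
{
        t->pids[type] = pid;
}

/* step 2: publish it, done later with the "tasklist lock" held */
static void attach_pid(struct task *t, enum pid_type type)
{
        t->pids[type]->ntasks[type]++;
}

int main(void)
{
        struct pid pid  = { .nr = 42 };
        struct pid pgrp = { .nr = 7 };
        struct pid sess = { .nr = 7 };
        struct task leader = { { NULL } };

        init_task_pid(&leader, PIDTYPE_PID, &pid);
        init_task_pid(&leader, PIDTYPE_PGID, &pgrp);    /* group leader only */
        init_task_pid(&leader, PIDTYPE_SID, &sess);     /* group leader only */

        attach_pid(&leader, PIDTYPE_PGID);
        attach_pid(&leader, PIDTYPE_SID);
        attach_pid(&leader, PIDTYPE_PID);

        printf("pid %d attached to %d task(s)\n", pid.nr, pid.ntasks[PIDTYPE_PID]);
        return 0;
}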
diff --git a/kernel/freezer.c b/kernel/freezer.c index c38893b0efba..8b2afc1c9df0 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
@@ -110,6 +110,18 @@ bool freeze_task(struct task_struct *p) | |||
110 | { | 110 | { |
111 | unsigned long flags; | 111 | unsigned long flags; |
112 | 112 | ||
113 | /* | ||
114 | * This check can race with freezer_do_not_count, but worst case that | ||
115 | * will result in an extra wakeup being sent to the task. It does not | ||
116 | * race with freezer_count(), the barriers in freezer_count() and | ||
117 | * freezer_should_skip() ensure that either freezer_count() sees | ||
118 | * freezing == true in try_to_freeze() and freezes, or | ||
119 | * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task | ||
120 | * normally. | ||
121 | */ | ||
122 | if (freezer_should_skip(p)) | ||
123 | return false; | ||
124 | |||
113 | spin_lock_irqsave(&freezer_lock, flags); | 125 | spin_lock_irqsave(&freezer_lock, flags); |
114 | if (!freezing(p) || frozen(p)) { | 126 | if (!freezing(p) || frozen(p)) { |
115 | spin_unlock_irqrestore(&freezer_lock, flags); | 127 | spin_unlock_irqrestore(&freezer_lock, flags); |
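The early freezer_should_skip() test is deliberately placed before freezer_lock is taken: it can race with the flag being cleared, but as the new comment notes, the worst case is a single extra wakeup, so the unlocked read buys a cheap fast path. The general pattern as a small C11 sketch, with an invented flag standing in for the freezer skip bit:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool should_skip;         /* analogue of the freezer "skip" flag */
static int wakeups;

static bool freeze_task(void)
{
        /*
         * Unlocked fast path: may race with the flag being cleared, but a
         * stale value only costs the locked slow path below (an extra
         * wakeup in the kernel), never a missed freeze.
         */
        if (atomic_load(&should_skip))
                return false;

        pthread_mutex_lock(&lock);
        wakeups++;                      /* the expensive, serialized part */
        pthread_mutex_unlock(&lock);
        return true;
}

int main(void)
{
        atomic_store(&should_skip, true);
        printf("frozen: %d\n", freeze_task());  /* skipped without the lock */

        atomic_store(&should_skip, false);
        printf("frozen: %d\n", freeze_task());  /* takes the lock */
        return 0;
}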
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index fd4b13b131f8..f0f4fe29cd21 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/sched/sysctl.h> | 47 | #include <linux/sched/sysctl.h> |
48 | #include <linux/sched/rt.h> | 48 | #include <linux/sched/rt.h> |
49 | #include <linux/timer.h> | 49 | #include <linux/timer.h> |
50 | #include <linux/freezer.h> | ||
50 | 51 | ||
51 | #include <asm/uaccess.h> | 52 | #include <asm/uaccess.h> |
52 | 53 | ||
@@ -721,17 +722,20 @@ static int hrtimer_switch_to_hres(void) | |||
721 | return 1; | 722 | return 1; |
722 | } | 723 | } |
723 | 724 | ||
725 | static void clock_was_set_work(struct work_struct *work) | ||
726 | { | ||
727 | clock_was_set(); | ||
728 | } | ||
729 | |||
730 | static DECLARE_WORK(hrtimer_work, clock_was_set_work); | ||
731 | |||
724 | /* | 732 | /* |
725 | * Called from timekeeping code to reprogramm the hrtimer interrupt | 733 | * Called from timekeeping and resume code to reprogram the hrtimer |
726 | * device. If called from the timer interrupt context we defer it to | 734 | * interrupt device on all cpus. |
727 | * softirq context. | ||
728 | */ | 735 | */ |
729 | void clock_was_set_delayed(void) | 736 | void clock_was_set_delayed(void) |
730 | { | 737 | { |
731 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 738 | schedule_work(&hrtimer_work); |
732 | |||
733 | cpu_base->clock_was_set = 1; | ||
734 | __raise_softirq_irqoff(HRTIMER_SOFTIRQ); | ||
735 | } | 739 | } |
736 | 740 | ||
737 | #else | 741 | #else |
@@ -773,15 +777,19 @@ void clock_was_set(void) | |||
773 | 777 | ||
774 | /* | 778 | /* |
775 | * During resume we might have to reprogram the high resolution timer | 779 | * During resume we might have to reprogram the high resolution timer |
776 | * interrupt (on the local CPU): | 780 | * interrupt on all online CPUs. However, all other CPUs will be |
781 | * stopped with interrupts disabled so the clock_was_set() call | ||
782 | * must be deferred. | ||
777 | */ | 783 | */ |
778 | void hrtimers_resume(void) | 784 | void hrtimers_resume(void) |
779 | { | 785 | { |
780 | WARN_ONCE(!irqs_disabled(), | 786 | WARN_ONCE(!irqs_disabled(), |
781 | KERN_INFO "hrtimers_resume() called with IRQs enabled!"); | 787 | KERN_INFO "hrtimers_resume() called with IRQs enabled!"); |
782 | 788 | ||
789 | /* Retrigger on the local CPU */ | ||
783 | retrigger_next_event(NULL); | 790 | retrigger_next_event(NULL); |
784 | timerfd_clock_was_set(); | 791 | /* And schedule a retrigger for all others */ |
792 | clock_was_set_delayed(); | ||
785 | } | 793 | } |
786 | 794 | ||
787 | static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) | 795 | static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) |
@@ -1432,13 +1440,6 @@ void hrtimer_peek_ahead_timers(void) | |||
1432 | 1440 | ||
1433 | static void run_hrtimer_softirq(struct softirq_action *h) | 1441 | static void run_hrtimer_softirq(struct softirq_action *h) |
1434 | { | 1442 | { |
1435 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
1436 | |||
1437 | if (cpu_base->clock_was_set) { | ||
1438 | cpu_base->clock_was_set = 0; | ||
1439 | clock_was_set(); | ||
1440 | } | ||
1441 | |||
1442 | hrtimer_peek_ahead_timers(); | 1443 | hrtimer_peek_ahead_timers(); |
1443 | } | 1444 | } |
1444 | 1445 | ||
@@ -1545,7 +1546,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod | |||
1545 | t->task = NULL; | 1546 | t->task = NULL; |
1546 | 1547 | ||
1547 | if (likely(t->task)) | 1548 | if (likely(t->task)) |
1548 | schedule(); | 1549 | freezable_schedule(); |
1549 | 1550 | ||
1550 | hrtimer_cancel(&t->timer); | 1551 | hrtimer_cancel(&t->timer); |
1551 | mode = HRTIMER_MODE_ABS; | 1552 | mode = HRTIMER_MODE_ABS; |
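Earlier in this file, clock_was_set_delayed() now just schedules a statically declared work item (DECLARE_WORK plus schedule_work) instead of setting a per-CPU flag for the hrtimer softirq, so the heavyweight clock_was_set() runs later in process context. A rough user-space analogue of deferring a call to a worker thread — pthread-based, invented names, and far cruder than a kernel workqueue (the real caller cannot take a mutex at all):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t kick = PTHREAD_COND_INITIALIZER;
static bool work_pending, stop;

static void clock_was_set(void)
{
        printf("retriggering hrtimer bases on all CPUs\n");
}

/* the "workqueue": runs deferred calls in normal, sleepable context */
static void *worker(void *arg)
{
        pthread_mutex_lock(&lock);
        while (work_pending || !stop) {
                if (work_pending) {
                        work_pending = false;
                        pthread_mutex_unlock(&lock);
                        clock_was_set();        /* may sleep, take locks, etc. */
                        pthread_mutex_lock(&lock);
                } else {
                        pthread_cond_wait(&kick, &lock);
                }
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

/* analogue of clock_was_set_delayed(): cheap enough for the hot caller */
static void clock_was_set_delayed(void)
{
        pthread_mutex_lock(&lock);
        work_pending = true;
        pthread_cond_signal(&kick);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);
        clock_was_set_delayed();

        pthread_mutex_lock(&lock);
        stop = true;
        pthread_cond_signal(&kick);
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        return 0;
}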
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 1c39eccc1eaf..10e663ab1f4a 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c | |||
@@ -135,7 +135,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d) | |||
135 | } | 135 | } |
136 | 136 | ||
137 | /** | 137 | /** |
138 | * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt | 138 | * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt |
139 | * @d: irq_data | 139 | * @d: irq_data |
140 | */ | 140 | */ |
141 | void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) | 141 | void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) |
@@ -275,10 +275,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, | |||
275 | if (d->gc) | 275 | if (d->gc) |
276 | return -EBUSY; | 276 | return -EBUSY; |
277 | 277 | ||
278 | if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR) | 278 | numchips = d->revmap_size / irqs_per_chip; |
279 | return -EINVAL; | ||
280 | |||
281 | numchips = d->revmap_data.linear.size / irqs_per_chip; | ||
282 | if (!numchips) | 279 | if (!numchips) |
283 | return -EINVAL; | 280 | return -EINVAL; |
284 | 281 | ||
@@ -310,6 +307,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, | |||
310 | /* Calc pointer to the next generic chip */ | 307 | /* Calc pointer to the next generic chip */ |
311 | tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); | 308 | tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); |
312 | } | 309 | } |
310 | d->name = name; | ||
313 | return 0; | 311 | return 0; |
314 | } | 312 | } |
315 | EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); | 313 | EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1ed8dff17eb9..2d7cd3428365 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -23,9 +23,11 @@ static DEFINE_MUTEX(revmap_trees_mutex); | |||
23 | static struct irq_domain *irq_default_domain; | 23 | static struct irq_domain *irq_default_domain; |
24 | 24 | ||
25 | /** | 25 | /** |
26 | * irq_domain_alloc() - Allocate a new irq_domain data structure | 26 | * __irq_domain_add() - Allocate a new irq_domain data structure |
27 | * @of_node: optional device-tree node of the interrupt controller | 27 | * @of_node: optional device-tree node of the interrupt controller |
28 | * @revmap_type: type of reverse mapping to use | 28 | * @size: Size of linear map; 0 for radix mapping only |
29 | * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no | ||
30 | * direct mapping | ||
29 | * @ops: map/unmap domain callbacks | 31 | * @ops: map/unmap domain callbacks |
30 | * @host_data: Controller private data pointer | 32 | * @host_data: Controller private data pointer |
31 | * | 33 | * |
@@ -33,41 +35,35 @@ static struct irq_domain *irq_default_domain; | |||
33 | * register allocated irq_domain with irq_domain_register(). Returns pointer | 35 | * register allocated irq_domain with irq_domain_register(). Returns pointer |
34 | * to IRQ domain, or NULL on failure. | 36 | * to IRQ domain, or NULL on failure. |
35 | */ | 37 | */ |
36 | static struct irq_domain *irq_domain_alloc(struct device_node *of_node, | 38 | struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, |
37 | unsigned int revmap_type, | 39 | irq_hw_number_t hwirq_max, int direct_max, |
38 | const struct irq_domain_ops *ops, | 40 | const struct irq_domain_ops *ops, |
39 | void *host_data) | 41 | void *host_data) |
40 | { | 42 | { |
41 | struct irq_domain *domain; | 43 | struct irq_domain *domain; |
42 | 44 | ||
43 | domain = kzalloc_node(sizeof(*domain), GFP_KERNEL, | 45 | domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), |
44 | of_node_to_nid(of_node)); | 46 | GFP_KERNEL, of_node_to_nid(of_node)); |
45 | if (WARN_ON(!domain)) | 47 | if (WARN_ON(!domain)) |
46 | return NULL; | 48 | return NULL; |
47 | 49 | ||
48 | /* Fill structure */ | 50 | /* Fill structure */ |
49 | domain->revmap_type = revmap_type; | 51 | INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); |
50 | domain->ops = ops; | 52 | domain->ops = ops; |
51 | domain->host_data = host_data; | 53 | domain->host_data = host_data; |
52 | domain->of_node = of_node_get(of_node); | 54 | domain->of_node = of_node_get(of_node); |
55 | domain->hwirq_max = hwirq_max; | ||
56 | domain->revmap_size = size; | ||
57 | domain->revmap_direct_max_irq = direct_max; | ||
53 | 58 | ||
54 | return domain; | ||
55 | } | ||
56 | |||
57 | static void irq_domain_free(struct irq_domain *domain) | ||
58 | { | ||
59 | of_node_put(domain->of_node); | ||
60 | kfree(domain); | ||
61 | } | ||
62 | |||
63 | static void irq_domain_add(struct irq_domain *domain) | ||
64 | { | ||
65 | mutex_lock(&irq_domain_mutex); | 59 | mutex_lock(&irq_domain_mutex); |
66 | list_add(&domain->link, &irq_domain_list); | 60 | list_add(&domain->link, &irq_domain_list); |
67 | mutex_unlock(&irq_domain_mutex); | 61 | mutex_unlock(&irq_domain_mutex); |
68 | pr_debug("Allocated domain of type %d @0x%p\n", | 62 | |
69 | domain->revmap_type, domain); | 63 | pr_debug("Added domain %s\n", domain->name); |
64 | return domain; | ||
70 | } | 65 | } |
66 | EXPORT_SYMBOL_GPL(__irq_domain_add); | ||
71 | 67 | ||
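__irq_domain_add() above collapses the old per-type union and sizes the linear reverse map at allocation time by over-allocating the domain structure itself (kzalloc of sizeof(*domain) plus size slots). The same trailing-array trick on a toy struct with a C99 flexible array member — illustrative only, not the irq_domain layout:

#include <stdio.h>
#include <stdlib.h>

struct domain {
        const char *name;
        unsigned int revmap_size;
        unsigned int linear_revmap[];   /* flexible array member, sized below */
};

static struct domain *domain_add(const char *name, unsigned int size)
{
        /* one allocation covers the struct and its linear map */
        struct domain *d = calloc(1, sizeof(*d) + size * sizeof(unsigned int));

        if (!d)
                return NULL;
        d->name = name;
        d->revmap_size = size;
        return d;
}

int main(void)
{
        struct domain *d = domain_add("toy-intc", 32);

        if (!d)
                return 1;
        d->linear_revmap[5] = 100;      /* hwirq 5 -> virq 100 */
        printf("%s: hwirq 5 maps to virq %u\n", d->name, d->linear_revmap[5]);
        free(d);
        return 0;
}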
72 | /** | 68 | /** |
73 | * irq_domain_remove() - Remove an irq domain. | 69 | * irq_domain_remove() - Remove an irq domain. |
@@ -81,29 +77,12 @@ void irq_domain_remove(struct irq_domain *domain) | |||
81 | { | 77 | { |
82 | mutex_lock(&irq_domain_mutex); | 78 | mutex_lock(&irq_domain_mutex); |
83 | 79 | ||
84 | switch (domain->revmap_type) { | 80 | /* |
85 | case IRQ_DOMAIN_MAP_LEGACY: | 81 | * radix_tree_delete() takes care of destroying the root |
86 | /* | 82 | * node when all entries are removed. Shout if there are |
87 | * Legacy domains don't manage their own irq_desc | 83 | * any mappings left. |
88 | * allocations, we expect the caller to handle irq_desc | 84 | */ |
89 | * freeing on their own. | 85 | WARN_ON(domain->revmap_tree.height); |
90 | */ | ||
91 | break; | ||
92 | case IRQ_DOMAIN_MAP_TREE: | ||
93 | /* | ||
94 | * radix_tree_delete() takes care of destroying the root | ||
95 | * node when all entries are removed. Shout if there are | ||
96 | * any mappings left. | ||
97 | */ | ||
98 | WARN_ON(domain->revmap_data.tree.height); | ||
99 | break; | ||
100 | case IRQ_DOMAIN_MAP_LINEAR: | ||
101 | kfree(domain->revmap_data.linear.revmap); | ||
102 | domain->revmap_data.linear.size = 0; | ||
103 | break; | ||
104 | case IRQ_DOMAIN_MAP_NOMAP: | ||
105 | break; | ||
106 | } | ||
107 | 86 | ||
108 | list_del(&domain->link); | 87 | list_del(&domain->link); |
109 | 88 | ||
@@ -115,44 +94,30 @@ void irq_domain_remove(struct irq_domain *domain) | |||
115 | 94 | ||
116 | mutex_unlock(&irq_domain_mutex); | 95 | mutex_unlock(&irq_domain_mutex); |
117 | 96 | ||
118 | pr_debug("Removed domain of type %d @0x%p\n", | 97 | pr_debug("Removed domain %s\n", domain->name); |
119 | domain->revmap_type, domain); | ||
120 | 98 | ||
121 | irq_domain_free(domain); | 99 | of_node_put(domain->of_node); |
100 | kfree(domain); | ||
122 | } | 101 | } |
123 | EXPORT_SYMBOL_GPL(irq_domain_remove); | 102 | EXPORT_SYMBOL_GPL(irq_domain_remove); |
124 | 103 | ||
125 | static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, | ||
126 | irq_hw_number_t hwirq) | ||
127 | { | ||
128 | irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq; | ||
129 | int size = domain->revmap_data.legacy.size; | ||
130 | |||
131 | if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size)) | ||
132 | return 0; | ||
133 | return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq; | ||
134 | } | ||
135 | |||
136 | /** | 104 | /** |
137 | * irq_domain_add_simple() - Allocate and register a simple irq_domain. | 105 | * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs |
138 | * @of_node: pointer to interrupt controller's device tree node. | 106 | * @of_node: pointer to interrupt controller's device tree node. |
139 | * @size: total number of irqs in mapping | 107 | * @size: total number of irqs in mapping |
140 | * @first_irq: first number of irq block assigned to the domain, | 108 | * @first_irq: first number of irq block assigned to the domain, |
141 | * pass zero to assign irqs on-the-fly. This will result in a | 109 | * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then |
142 | * linear IRQ domain so it is important to use irq_create_mapping() | 110 | * pre-map all of the irqs in the domain to virqs starting at first_irq. |
143 | * for each used IRQ, especially when SPARSE_IRQ is enabled. | ||
144 | * @ops: map/unmap domain callbacks | 111 | * @ops: map/unmap domain callbacks |
145 | * @host_data: Controller private data pointer | 112 | * @host_data: Controller private data pointer |
146 | * | 113 | * |
147 | * Allocates a legacy irq_domain if irq_base is positive or a linear | 114 | * Allocates an irq_domain, and optionally if first_irq is positive then also |
148 | * domain otherwise. For the legacy domain, IRQ descriptors will also | 115 | * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq. |
149 | * be allocated. | ||
150 | * | 116 | * |
151 | * This is intended to implement the expected behaviour for most | 117 | * This is intended to implement the expected behaviour for most |
152 | * interrupt controllers which is that a linear mapping should | 118 | * interrupt controllers. If device tree is used, then first_irq will be 0 and |
153 | * normally be used unless the system requires a legacy mapping in | 119 | * irqs get mapped dynamically on the fly. However, if the controller requires |
154 | * order to support supplying interrupt numbers during non-DT | 120 | * static virq assignments (non-DT boot) then it will set that up correctly. |
155 | * registration of devices. | ||
156 | */ | 121 | */ |
157 | struct irq_domain *irq_domain_add_simple(struct device_node *of_node, | 122 | struct irq_domain *irq_domain_add_simple(struct device_node *of_node, |
158 | unsigned int size, | 123 | unsigned int size, |
@@ -160,33 +125,25 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, | |||
160 | const struct irq_domain_ops *ops, | 125 | const struct irq_domain_ops *ops, |
161 | void *host_data) | 126 | void *host_data) |
162 | { | 127 | { |
163 | if (first_irq > 0) { | 128 | struct irq_domain *domain; |
164 | int irq_base; | 129 | |
130 | domain = __irq_domain_add(of_node, size, size, 0, ops, host_data); | ||
131 | if (!domain) | ||
132 | return NULL; | ||
165 | 133 | ||
134 | if (first_irq > 0) { | ||
166 | if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { | 135 | if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { |
167 | /* | 136 | /* attempt to allocate irq_descs */ |
168 | * Set the descriptor allocator to search for a | 137 | int rc = irq_alloc_descs(first_irq, first_irq, size, |
169 | * 1-to-1 mapping, such as irq_alloc_desc_at(). | 138 | of_node_to_nid(of_node)); |
170 | * Use of_node_to_nid() which is defined to | 139 | if (rc < 0) |
171 | * numa_node_id() on platforms that have no custom | ||
172 | * implementation. | ||
173 | */ | ||
174 | irq_base = irq_alloc_descs(first_irq, first_irq, size, | ||
175 | of_node_to_nid(of_node)); | ||
176 | if (irq_base < 0) { | ||
177 | pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", | 140 | pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", |
178 | first_irq); | 141 | first_irq); |
179 | irq_base = first_irq; | 142 | } |
180 | } | 143 | irq_domain_associate_many(domain, first_irq, 0, size); |
181 | } else | ||
182 | irq_base = first_irq; | ||
183 | |||
184 | return irq_domain_add_legacy(of_node, size, irq_base, 0, | ||
185 | ops, host_data); | ||
186 | } | 144 | } |
187 | 145 | ||
188 | /* A linear domain is the default */ | 146 | return domain; |
189 | return irq_domain_add_linear(of_node, size, ops, host_data); | ||
190 | } | 147 | } |
191 | EXPORT_SYMBOL_GPL(irq_domain_add_simple); | 148 | EXPORT_SYMBOL_GPL(irq_domain_add_simple); |
192 | 149 | ||
@@ -213,131 +170,19 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, | |||
213 | void *host_data) | 170 | void *host_data) |
214 | { | 171 | { |
215 | struct irq_domain *domain; | 172 | struct irq_domain *domain; |
216 | unsigned int i; | ||
217 | 173 | ||
218 | domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); | 174 | domain = __irq_domain_add(of_node, first_hwirq + size, |
175 | first_hwirq + size, 0, ops, host_data); | ||
219 | if (!domain) | 176 | if (!domain) |
220 | return NULL; | 177 | return NULL; |
221 | 178 | ||
222 | domain->revmap_data.legacy.first_irq = first_irq; | 179 | irq_domain_associate_many(domain, first_irq, first_hwirq, size); |
223 | domain->revmap_data.legacy.first_hwirq = first_hwirq; | ||
224 | domain->revmap_data.legacy.size = size; | ||
225 | 180 | ||
226 | mutex_lock(&irq_domain_mutex); | ||
227 | /* Verify that all the irqs are available */ | ||
228 | for (i = 0; i < size; i++) { | ||
229 | int irq = first_irq + i; | ||
230 | struct irq_data *irq_data = irq_get_irq_data(irq); | ||
231 | |||
232 | if (WARN_ON(!irq_data || irq_data->domain)) { | ||
233 | mutex_unlock(&irq_domain_mutex); | ||
234 | irq_domain_free(domain); | ||
235 | return NULL; | ||
236 | } | ||
237 | } | ||
238 | |||
239 | /* Claim all of the irqs before registering a legacy domain */ | ||
240 | for (i = 0; i < size; i++) { | ||
241 | struct irq_data *irq_data = irq_get_irq_data(first_irq + i); | ||
242 | irq_data->hwirq = first_hwirq + i; | ||
243 | irq_data->domain = domain; | ||
244 | } | ||
245 | mutex_unlock(&irq_domain_mutex); | ||
246 | |||
247 | for (i = 0; i < size; i++) { | ||
248 | int irq = first_irq + i; | ||
249 | int hwirq = first_hwirq + i; | ||
250 | |||
251 | /* IRQ0 gets ignored */ | ||
252 | if (!irq) | ||
253 | continue; | ||
254 | |||
255 | /* Legacy flags are left to default at this point, | ||
256 | * one can then use irq_create_mapping() to | ||
257 | * explicitly change them | ||
258 | */ | ||
259 | if (ops->map) | ||
260 | ops->map(domain, irq, hwirq); | ||
261 | |||
262 | /* Clear norequest flags */ | ||
263 | irq_clear_status_flags(irq, IRQ_NOREQUEST); | ||
264 | } | ||
265 | |||
266 | irq_domain_add(domain); | ||
267 | return domain; | 181 | return domain; |
268 | } | 182 | } |
269 | EXPORT_SYMBOL_GPL(irq_domain_add_legacy); | 183 | EXPORT_SYMBOL_GPL(irq_domain_add_legacy); |
270 | 184 | ||
271 | /** | 185 | /** |
272 | * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. | ||
273 | * @of_node: pointer to interrupt controller's device tree node. | ||
274 | * @size: Number of interrupts in the domain. | ||
275 | * @ops: map/unmap domain callbacks | ||
276 | * @host_data: Controller private data pointer | ||
277 | */ | ||
278 | struct irq_domain *irq_domain_add_linear(struct device_node *of_node, | ||
279 | unsigned int size, | ||
280 | const struct irq_domain_ops *ops, | ||
281 | void *host_data) | ||
282 | { | ||
283 | struct irq_domain *domain; | ||
284 | unsigned int *revmap; | ||
285 | |||
286 | revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL, | ||
287 | of_node_to_nid(of_node)); | ||
288 | if (WARN_ON(!revmap)) | ||
289 | return NULL; | ||
290 | |||
291 | domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data); | ||
292 | if (!domain) { | ||
293 | kfree(revmap); | ||
294 | return NULL; | ||
295 | } | ||
296 | domain->revmap_data.linear.size = size; | ||
297 | domain->revmap_data.linear.revmap = revmap; | ||
298 | irq_domain_add(domain); | ||
299 | return domain; | ||
300 | } | ||
301 | EXPORT_SYMBOL_GPL(irq_domain_add_linear); | ||
302 | |||
303 | struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, | ||
304 | unsigned int max_irq, | ||
305 | const struct irq_domain_ops *ops, | ||
306 | void *host_data) | ||
307 | { | ||
308 | struct irq_domain *domain = irq_domain_alloc(of_node, | ||
309 | IRQ_DOMAIN_MAP_NOMAP, ops, host_data); | ||
310 | if (domain) { | ||
311 | domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0; | ||
312 | irq_domain_add(domain); | ||
313 | } | ||
314 | return domain; | ||
315 | } | ||
316 | EXPORT_SYMBOL_GPL(irq_domain_add_nomap); | ||
317 | |||
318 | /** | ||
319 | * irq_domain_add_tree() | ||
320 | * @of_node: pointer to interrupt controller's device tree node. | ||
321 | * @ops: map/unmap domain callbacks | ||
322 | * | ||
323 | * Note: The radix tree will be allocated later during boot automatically | ||
324 | * (the reverse mapping will use the slow path until that happens). | ||
325 | */ | ||
326 | struct irq_domain *irq_domain_add_tree(struct device_node *of_node, | ||
327 | const struct irq_domain_ops *ops, | ||
328 | void *host_data) | ||
329 | { | ||
330 | struct irq_domain *domain = irq_domain_alloc(of_node, | ||
331 | IRQ_DOMAIN_MAP_TREE, ops, host_data); | ||
332 | if (domain) { | ||
333 | INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); | ||
334 | irq_domain_add(domain); | ||
335 | } | ||
336 | return domain; | ||
337 | } | ||
338 | EXPORT_SYMBOL_GPL(irq_domain_add_tree); | ||
339 | |||
340 | /** | ||
341 | * irq_find_host() - Locates a domain for a given device node | 186 | * irq_find_host() - Locates a domain for a given device node |
342 | * @node: device-tree node of the interrupt controller | 187 | * @node: device-tree node of the interrupt controller |
343 | */ | 188 | */ |
@@ -385,125 +230,108 @@ void irq_set_default_host(struct irq_domain *domain) | |||
385 | } | 230 | } |
386 | EXPORT_SYMBOL_GPL(irq_set_default_host); | 231 | EXPORT_SYMBOL_GPL(irq_set_default_host); |
387 | 232 | ||
388 | static void irq_domain_disassociate_many(struct irq_domain *domain, | 233 | static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) |
389 | unsigned int irq_base, int count) | ||
390 | { | 234 | { |
391 | /* | 235 | struct irq_data *irq_data = irq_get_irq_data(irq); |
392 | * disassociate in reverse order; | 236 | irq_hw_number_t hwirq; |
393 | * not strictly necessary, but nice for unwinding | ||
394 | */ | ||
395 | while (count--) { | ||
396 | int irq = irq_base + count; | ||
397 | struct irq_data *irq_data = irq_get_irq_data(irq); | ||
398 | irq_hw_number_t hwirq; | ||
399 | 237 | ||
400 | if (WARN_ON(!irq_data || irq_data->domain != domain)) | 238 | if (WARN(!irq_data || irq_data->domain != domain, |
401 | continue; | 239 | "virq%i doesn't exist; cannot disassociate\n", irq)) |
240 | return; | ||
402 | 241 | ||
403 | hwirq = irq_data->hwirq; | 242 | hwirq = irq_data->hwirq; |
404 | irq_set_status_flags(irq, IRQ_NOREQUEST); | 243 | irq_set_status_flags(irq, IRQ_NOREQUEST); |
405 | 244 | ||
406 | /* remove chip and handler */ | 245 | /* remove chip and handler */ |
407 | irq_set_chip_and_handler(irq, NULL, NULL); | 246 | irq_set_chip_and_handler(irq, NULL, NULL); |
408 | 247 | ||
409 | /* Make sure it's completed */ | 248 | /* Make sure it's completed */ |
410 | synchronize_irq(irq); | 249 | synchronize_irq(irq); |
411 | 250 | ||
412 | /* Tell the PIC about it */ | 251 | /* Tell the PIC about it */ |
413 | if (domain->ops->unmap) | 252 | if (domain->ops->unmap) |
414 | domain->ops->unmap(domain, irq); | 253 | domain->ops->unmap(domain, irq); |
415 | smp_mb(); | 254 | smp_mb(); |
416 | 255 | ||
417 | irq_data->domain = NULL; | 256 | irq_data->domain = NULL; |
418 | irq_data->hwirq = 0; | 257 | irq_data->hwirq = 0; |
419 | 258 | ||
420 | /* Clear reverse map */ | 259 | /* Clear reverse map for this hwirq */ |
421 | switch(domain->revmap_type) { | 260 | if (hwirq < domain->revmap_size) { |
422 | case IRQ_DOMAIN_MAP_LINEAR: | 261 | domain->linear_revmap[hwirq] = 0; |
423 | if (hwirq < domain->revmap_data.linear.size) | 262 | } else { |
424 | domain->revmap_data.linear.revmap[hwirq] = 0; | 263 | mutex_lock(&revmap_trees_mutex); |
425 | break; | 264 | radix_tree_delete(&domain->revmap_tree, hwirq); |
426 | case IRQ_DOMAIN_MAP_TREE: | 265 | mutex_unlock(&revmap_trees_mutex); |
427 | mutex_lock(&revmap_trees_mutex); | ||
428 | radix_tree_delete(&domain->revmap_data.tree, hwirq); | ||
429 | mutex_unlock(&revmap_trees_mutex); | ||
430 | break; | ||
431 | } | ||
432 | } | 266 | } |
433 | } | 267 | } |
434 | 268 | ||
435 | int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, | 269 | int irq_domain_associate(struct irq_domain *domain, unsigned int virq, |
436 | irq_hw_number_t hwirq_base, int count) | 270 | irq_hw_number_t hwirq) |
437 | { | 271 | { |
438 | unsigned int virq = irq_base; | 272 | struct irq_data *irq_data = irq_get_irq_data(virq); |
439 | irq_hw_number_t hwirq = hwirq_base; | 273 | int ret; |
440 | int i, ret; | ||
441 | 274 | ||
442 | pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, | 275 | if (WARN(hwirq >= domain->hwirq_max, |
443 | of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); | 276 | "error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name)) |
277 | return -EINVAL; | ||
278 | if (WARN(!irq_data, "error: virq%i is not allocated", virq)) | ||
279 | return -EINVAL; | ||
280 | if (WARN(irq_data->domain, "error: virq%i is already associated", virq)) | ||
281 | return -EINVAL; | ||
444 | 282 | ||
445 | for (i = 0; i < count; i++) { | 283 | mutex_lock(&irq_domain_mutex); |
446 | struct irq_data *irq_data = irq_get_irq_data(virq + i); | 284 | irq_data->hwirq = hwirq; |
447 | 285 | irq_data->domain = domain; | |
448 | if (WARN(!irq_data, "error: irq_desc not allocated; " | 286 | if (domain->ops->map) { |
449 | "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) | 287 | ret = domain->ops->map(domain, virq, hwirq); |
450 | return -EINVAL; | 288 | if (ret != 0) { |
451 | if (WARN(irq_data->domain, "error: irq_desc already associated; " | 289 | /* |
452 | "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) | 290 | * If map() returns -EPERM, this interrupt is protected |
453 | return -EINVAL; | 291 | * by the firmware or some other service and shall not |
454 | }; | 292 | * be mapped. Don't bother telling the user about it. |
455 | 293 | */ | |
456 | for (i = 0; i < count; i++, virq++, hwirq++) { | 294 | if (ret != -EPERM) { |
457 | struct irq_data *irq_data = irq_get_irq_data(virq); | 295 | pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", |
458 | 296 | domain->name, hwirq, virq, ret); | |
459 | irq_data->hwirq = hwirq; | ||
460 | irq_data->domain = domain; | ||
461 | if (domain->ops->map) { | ||
462 | ret = domain->ops->map(domain, virq, hwirq); | ||
463 | if (ret != 0) { | ||
464 | /* | ||
465 | * If map() returns -EPERM, this interrupt is protected | ||
466 | * by the firmware or some other service and shall not | ||
467 | * be mapped. | ||
468 | * | ||
469 | * Since on some platforms we blindly try to map everything | ||
470 | * we end up with a log full of backtraces. | ||
471 | * | ||
472 | * So instead, we silently fail on -EPERM, it is the | ||
473 | * responsibility of the PIC driver to display a relevant | ||
474 | * message if needed. | ||
475 | */ | ||
476 | if (ret != -EPERM) { | ||
477 | pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", | ||
478 | virq, hwirq, ret); | ||
479 | WARN_ON(1); | ||
480 | } | ||
481 | irq_data->domain = NULL; | ||
482 | irq_data->hwirq = 0; | ||
483 | goto err_unmap; | ||
484 | } | 297 | } |
298 | irq_data->domain = NULL; | ||
299 | irq_data->hwirq = 0; | ||
300 | mutex_unlock(&irq_domain_mutex); | ||
301 | return ret; | ||
485 | } | 302 | } |
486 | 303 | ||
487 | switch (domain->revmap_type) { | 304 | /* If not already assigned, give the domain the chip's name */ |
488 | case IRQ_DOMAIN_MAP_LINEAR: | 305 | if (!domain->name && irq_data->chip) |
489 | if (hwirq < domain->revmap_data.linear.size) | 306 | domain->name = irq_data->chip->name; |
490 | domain->revmap_data.linear.revmap[hwirq] = virq; | 307 | } |
491 | break; | ||
492 | case IRQ_DOMAIN_MAP_TREE: | ||
493 | mutex_lock(&revmap_trees_mutex); | ||
494 | radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); | ||
495 | mutex_unlock(&revmap_trees_mutex); | ||
496 | break; | ||
497 | } | ||
498 | 308 | ||
499 | irq_clear_status_flags(virq, IRQ_NOREQUEST); | 309 | if (hwirq < domain->revmap_size) { |
310 | domain->linear_revmap[hwirq] = virq; | ||
311 | } else { | ||
312 | mutex_lock(&revmap_trees_mutex); | ||
313 | radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); | ||
314 | mutex_unlock(&revmap_trees_mutex); | ||
500 | } | 315 | } |
316 | mutex_unlock(&irq_domain_mutex); | ||
317 | |||
318 | irq_clear_status_flags(virq, IRQ_NOREQUEST); | ||
501 | 319 | ||
502 | return 0; | 320 | return 0; |
321 | } | ||
322 | EXPORT_SYMBOL_GPL(irq_domain_associate); | ||
503 | 323 | ||
504 | err_unmap: | 324 | void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, |
505 | irq_domain_disassociate_many(domain, irq_base, i); | 325 | irq_hw_number_t hwirq_base, int count) |
506 | return -EINVAL; | 326 | { |
327 | int i; | ||
328 | |||
329 | pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, | ||
330 | of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); | ||
331 | |||
332 | for (i = 0; i < count; i++) { | ||
333 | irq_domain_associate(domain, irq_base + i, hwirq_base + i); | ||
334 | } | ||
507 | } | 335 | } |
508 | EXPORT_SYMBOL_GPL(irq_domain_associate_many); | 336 | EXPORT_SYMBOL_GPL(irq_domain_associate_many); |
509 | 337 | ||
@@ -513,7 +341,9 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many); | |||
513 | * | 341 | * |
514 | * This routine is used for irq controllers which can choose the hardware | 342 | * This routine is used for irq controllers which can choose the hardware |
515 | * interrupt numbers they generate. In such a case it's simplest to use | 343 | * interrupt numbers they generate. In such a case it's simplest to use |
516 | * the linux irq as the hardware interrupt number. | 344 | * the linux irq as the hardware interrupt number. It still uses the linear |
345 | * or radix tree to store the mapping, but the irq controller can optimize | ||
346 | * the revmap path by using the hwirq directly. | ||
517 | */ | 347 | */ |
518 | unsigned int irq_create_direct_mapping(struct irq_domain *domain) | 348 | unsigned int irq_create_direct_mapping(struct irq_domain *domain) |
519 | { | 349 | { |
@@ -522,17 +352,14 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) | |||
522 | if (domain == NULL) | 352 | if (domain == NULL) |
523 | domain = irq_default_domain; | 353 | domain = irq_default_domain; |
524 | 354 | ||
525 | if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP)) | ||
526 | return 0; | ||
527 | |||
528 | virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); | 355 | virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); |
529 | if (!virq) { | 356 | if (!virq) { |
530 | pr_debug("create_direct virq allocation failed\n"); | 357 | pr_debug("create_direct virq allocation failed\n"); |
531 | return 0; | 358 | return 0; |
532 | } | 359 | } |
533 | if (virq >= domain->revmap_data.nomap.max_irq) { | 360 | if (virq >= domain->revmap_direct_max_irq) { |
534 | pr_err("ERROR: no free irqs available below %i maximum\n", | 361 | pr_err("ERROR: no free irqs available below %i maximum\n", |
535 | domain->revmap_data.nomap.max_irq); | 362 | domain->revmap_direct_max_irq); |
536 | irq_free_desc(virq); | 363 | irq_free_desc(virq); |
537 | return 0; | 364 | return 0; |
538 | } | 365 | } |
@@ -569,9 +396,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
569 | if (domain == NULL) | 396 | if (domain == NULL) |
570 | domain = irq_default_domain; | 397 | domain = irq_default_domain; |
571 | if (domain == NULL) { | 398 | if (domain == NULL) { |
572 | pr_warning("irq_create_mapping called for" | 399 | WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq); |
573 | " NULL domain, hwirq=%lx\n", hwirq); | ||
574 | WARN_ON(1); | ||
575 | return 0; | 400 | return 0; |
576 | } | 401 | } |
577 | pr_debug("-> using domain @%p\n", domain); | 402 | pr_debug("-> using domain @%p\n", domain); |
@@ -583,10 +408,6 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
583 | return virq; | 408 | return virq; |
584 | } | 409 | } |
585 | 410 | ||
586 | /* Get a virtual interrupt number */ | ||
587 | if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) | ||
588 | return irq_domain_legacy_revmap(domain, hwirq); | ||
589 | |||
590 | /* Allocate a virtual interrupt number */ | 411 | /* Allocate a virtual interrupt number */ |
591 | hint = hwirq % nr_irqs; | 412 | hint = hwirq % nr_irqs; |
592 | if (hint == 0) | 413 | if (hint == 0) |
@@ -639,12 +460,7 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, | |||
639 | if (unlikely(ret < 0)) | 460 | if (unlikely(ret < 0)) |
640 | return ret; | 461 | return ret; |
641 | 462 | ||
642 | ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count); | 463 | irq_domain_associate_many(domain, irq_base, hwirq_base, count); |
643 | if (unlikely(ret < 0)) { | ||
644 | irq_free_descs(irq_base, count); | ||
645 | return ret; | ||
646 | } | ||
647 | |||
648 | return 0; | 464 | return 0; |
649 | } | 465 | } |
650 | EXPORT_SYMBOL_GPL(irq_create_strict_mappings); | 466 | EXPORT_SYMBOL_GPL(irq_create_strict_mappings); |
@@ -671,8 +487,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller, | |||
671 | if (intsize > 0) | 487 | if (intsize > 0) |
672 | return intspec[0]; | 488 | return intspec[0]; |
673 | #endif | 489 | #endif |
674 | pr_warning("no irq domain found for %s !\n", | 490 | pr_warn("no irq domain found for %s !\n", |
675 | of_node_full_name(controller)); | 491 | of_node_full_name(controller)); |
676 | return 0; | 492 | return 0; |
677 | } | 493 | } |
678 | 494 | ||
@@ -714,11 +530,7 @@ void irq_dispose_mapping(unsigned int virq) | |||
714 | if (WARN_ON(domain == NULL)) | 530 | if (WARN_ON(domain == NULL)) |
715 | return; | 531 | return; |
716 | 532 | ||
717 | /* Never unmap legacy interrupts */ | 533 | irq_domain_disassociate(domain, virq); |
718 | if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) | ||
719 | return; | ||
720 | |||
721 | irq_domain_disassociate_many(domain, virq, 1); | ||
722 | irq_free_desc(virq); | 534 | irq_free_desc(virq); |
723 | } | 535 | } |
724 | EXPORT_SYMBOL_GPL(irq_dispose_mapping); | 536 | EXPORT_SYMBOL_GPL(irq_dispose_mapping); |
@@ -739,63 +551,51 @@ unsigned int irq_find_mapping(struct irq_domain *domain, | |||
739 | if (domain == NULL) | 551 | if (domain == NULL) |
740 | return 0; | 552 | return 0; |
741 | 553 | ||
742 | switch (domain->revmap_type) { | 554 | if (hwirq < domain->revmap_direct_max_irq) { |
743 | case IRQ_DOMAIN_MAP_LEGACY: | ||
744 | return irq_domain_legacy_revmap(domain, hwirq); | ||
745 | case IRQ_DOMAIN_MAP_LINEAR: | ||
746 | return irq_linear_revmap(domain, hwirq); | ||
747 | case IRQ_DOMAIN_MAP_TREE: | ||
748 | rcu_read_lock(); | ||
749 | data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); | ||
750 | rcu_read_unlock(); | ||
751 | if (data) | ||
752 | return data->irq; | ||
753 | break; | ||
754 | case IRQ_DOMAIN_MAP_NOMAP: | ||
755 | data = irq_get_irq_data(hwirq); | 555 | data = irq_get_irq_data(hwirq); |
756 | if (data && (data->domain == domain) && (data->hwirq == hwirq)) | 556 | if (data && (data->domain == domain) && (data->hwirq == hwirq)) |
757 | return hwirq; | 557 | return hwirq; |
758 | break; | ||
759 | } | 558 | } |
760 | 559 | ||
761 | return 0; | 560 | /* Check if the hwirq is in the linear revmap. */ |
762 | } | 561 | if (hwirq < domain->revmap_size) |
763 | EXPORT_SYMBOL_GPL(irq_find_mapping); | 562 | return domain->linear_revmap[hwirq]; |
764 | 563 | ||
765 | /** | 564 | rcu_read_lock(); |
766 | * irq_linear_revmap() - Find a linux irq from a hw irq number. | 565 | data = radix_tree_lookup(&domain->revmap_tree, hwirq); |
767 | * @domain: domain owning this hardware interrupt | 566 | rcu_read_unlock(); |
768 | * @hwirq: hardware irq number in that domain space | 567 | return data ? data->irq : 0; |
769 | * | ||
770 | * This is a fast path that can be called directly by irq controller code to | ||
771 | * save a handful of instructions. | ||
772 | */ | ||
773 | unsigned int irq_linear_revmap(struct irq_domain *domain, | ||
774 | irq_hw_number_t hwirq) | ||
775 | { | ||
776 | BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); | ||
777 | |||
778 | /* Check revmap bounds; complain if exceeded */ | ||
779 | if (WARN_ON(hwirq >= domain->revmap_data.linear.size)) | ||
780 | return 0; | ||
781 | |||
782 | return domain->revmap_data.linear.revmap[hwirq]; | ||
783 | } | 568 | } |
784 | EXPORT_SYMBOL_GPL(irq_linear_revmap); | 569 | EXPORT_SYMBOL_GPL(irq_find_mapping); |
785 | 570 | ||
786 | #ifdef CONFIG_IRQ_DOMAIN_DEBUG | 571 | #ifdef CONFIG_IRQ_DOMAIN_DEBUG |
787 | static int virq_debug_show(struct seq_file *m, void *private) | 572 | static int virq_debug_show(struct seq_file *m, void *private) |
788 | { | 573 | { |
789 | unsigned long flags; | 574 | unsigned long flags; |
790 | struct irq_desc *desc; | 575 | struct irq_desc *desc; |
791 | const char *p; | 576 | struct irq_domain *domain; |
792 | static const char none[] = "none"; | 577 | struct radix_tree_iter iter; |
793 | void *data; | 578 | void *data, **slot; |
794 | int i; | 579 | int i; |
795 | 580 | ||
796 | seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", | 581 | seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", |
582 | "name", "mapped", "linear-max", "direct-max", "devtree-node"); | ||
583 | mutex_lock(&irq_domain_mutex); | ||
584 | list_for_each_entry(domain, &irq_domain_list, link) { | ||
585 | int count = 0; | ||
586 | radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) | ||
587 | count++; | ||
588 | seq_printf(m, "%c%-16s %6u %10u %10u %s\n", | ||
589 | domain == irq_default_domain ? '*' : ' ', domain->name, | ||
590 | domain->revmap_size + count, domain->revmap_size, | ||
591 | domain->revmap_direct_max_irq, | ||
592 | domain->of_node ? of_node_full_name(domain->of_node) : ""); | ||
593 | } | ||
594 | mutex_unlock(&irq_domain_mutex); | ||
595 | |||
596 | seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq", | ||
797 | "chip name", (int)(2 * sizeof(void *) + 2), "chip data", | 597 | "chip name", (int)(2 * sizeof(void *) + 2), "chip data", |
798 | "domain name"); | 598 | "active", "type", "domain"); |
799 | 599 | ||
800 | for (i = 1; i < nr_irqs; i++) { | 600 | for (i = 1; i < nr_irqs; i++) { |
801 | desc = irq_to_desc(i); | 601 | desc = irq_to_desc(i); |
@@ -803,28 +603,28 @@ static int virq_debug_show(struct seq_file *m, void *private) | |||
803 | continue; | 603 | continue; |
804 | 604 | ||
805 | raw_spin_lock_irqsave(&desc->lock, flags); | 605 | raw_spin_lock_irqsave(&desc->lock, flags); |
606 | domain = desc->irq_data.domain; | ||
806 | 607 | ||
807 | if (desc->action && desc->action->handler) { | 608 | if (domain) { |
808 | struct irq_chip *chip; | 609 | struct irq_chip *chip; |
610 | int hwirq = desc->irq_data.hwirq; | ||
611 | bool direct; | ||
809 | 612 | ||
810 | seq_printf(m, "%5d ", i); | 613 | seq_printf(m, "%5d ", i); |
811 | seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); | 614 | seq_printf(m, "0x%05x ", hwirq); |
812 | 615 | ||
813 | chip = irq_desc_get_chip(desc); | 616 | chip = irq_desc_get_chip(desc); |
814 | if (chip && chip->name) | 617 | seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); |
815 | p = chip->name; | ||
816 | else | ||
817 | p = none; | ||
818 | seq_printf(m, "%-15s ", p); | ||
819 | 618 | ||
820 | data = irq_desc_get_chip_data(desc); | 619 | data = irq_desc_get_chip_data(desc); |
821 | seq_printf(m, data ? "0x%p " : " %p ", data); | 620 | seq_printf(m, data ? "0x%p " : " %p ", data); |
822 | 621 | ||
823 | if (desc->irq_data.domain) | 622 | seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); |
824 | p = of_node_full_name(desc->irq_data.domain->of_node); | 623 | direct = (i == hwirq) && (i < domain->revmap_direct_max_irq); |
825 | else | 624 | seq_printf(m, "%6s%-8s ", |
826 | p = none; | 625 | (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", |
827 | seq_printf(m, "%s\n", p); | 626 | direct ? "(DIRECT)" : ""); |
627 | seq_printf(m, "%s\n", desc->irq_data.domain->name); | ||
828 | } | 628 | } |
829 | 629 | ||
830 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 630 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
@@ -921,18 +721,3 @@ const struct irq_domain_ops irq_domain_simple_ops = { | |||
921 | .xlate = irq_domain_xlate_onetwocell, | 721 | .xlate = irq_domain_xlate_onetwocell, |
922 | }; | 722 | }; |
923 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | 723 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); |
924 | |||
925 | #ifdef CONFIG_OF_IRQ | ||
926 | void irq_domain_generate_simple(const struct of_device_id *match, | ||
927 | u64 phys_base, unsigned int irq_start) | ||
928 | { | ||
929 | struct device_node *node; | ||
930 | pr_debug("looking for phys_base=%llx, irq_start=%i\n", | ||
931 | (unsigned long long) phys_base, (int) irq_start); | ||
932 | node = of_find_matching_node_by_address(NULL, match, phys_base); | ||
933 | if (node) | ||
934 | irq_domain_add_legacy(node, 32, irq_start, 0, | ||
935 | &irq_domain_simple_ops, NULL); | ||
936 | } | ||
937 | EXPORT_SYMBOL_GPL(irq_domain_generate_simple); | ||
938 | #endif | ||
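
The irqdomain rework above folds the legacy, linear, tree and nomap variants into a single __irq_domain_add() path: hwirqs below revmap_size sit in a linear array, anything larger goes into the per-domain radix tree, and irq_domain_add_legacy()/irq_domain_add_simple() become thin wrappers that associate a pre-allocated irq range. The following is a minimal, illustrative driver-side sketch of sitting on top of that consolidated API; the foo_* names and the 32-interrupt size are invented and this is not code from the patch.

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

/* .map runs from irq_domain_associate(); errors other than -EPERM are
 * now reported with the domain's name instead of a backtrace. */
static int foo_irq_map(struct irq_domain *d, unsigned int virq,
		       irq_hw_number_t hwirq)
{
	irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
	irq_set_chip_data(virq, d->host_data);
	return 0;
}

static const struct irq_domain_ops foo_irq_ops = {
	.map	= foo_irq_map,
	.xlate	= irq_domain_xlate_onecell,
};

static struct irq_domain *foo_probe_intc(struct device_node *np, void *priv)
{
	/* 32 hwirqs live in the linear revmap; hwirqs at or above
	 * revmap_size would be tracked in the per-domain radix tree. */
	struct irq_domain *d = irq_domain_add_linear(np, 32, &foo_irq_ops, priv);

	if (!d)
		return NULL;

	/* Reverse lookups for every domain type now go through the same
	 * irq_find_mapping() path shown in the hunks above. */
	return d;
}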
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 19ed5c425c3b..36f6ee181b0c 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -462,6 +462,8 @@ int show_interrupts(struct seq_file *p, void *v) | |||
462 | } else { | 462 | } else { |
463 | seq_printf(p, " %8s", "None"); | 463 | seq_printf(p, " %8s", "None"); |
464 | } | 464 | } |
465 | if (desc->irq_data.domain) | ||
466 | seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq); | ||
465 | #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL | 467 | #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL |
466 | seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); | 468 | seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); |
467 | #endif | 469 | #endif |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 8241906c4b61..fb326365b694 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -147,6 +147,9 @@ int __request_module(bool wait, const char *fmt, ...) | |||
147 | */ | 147 | */ |
148 | WARN_ON_ONCE(wait && current_is_async()); | 148 | WARN_ON_ONCE(wait && current_is_async()); |
149 | 149 | ||
150 | if (!modprobe_path[0]) | ||
151 | return 0; | ||
152 | |||
150 | va_start(args, fmt); | 153 | va_start(args, fmt); |
151 | ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); | 154 | ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); |
152 | va_end(args); | 155 | va_end(args); |
@@ -569,14 +572,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) | |||
569 | int retval = 0; | 572 | int retval = 0; |
570 | 573 | ||
571 | helper_lock(); | 574 | helper_lock(); |
572 | if (!sub_info->path) { | ||
573 | retval = -EINVAL; | ||
574 | goto out; | ||
575 | } | ||
576 | |||
577 | if (sub_info->path[0] == '\0') | ||
578 | goto out; | ||
579 | |||
580 | if (!khelper_wq || usermodehelper_disabled) { | 575 | if (!khelper_wq || usermodehelper_disabled) { |
581 | retval = -EBUSY; | 576 | retval = -EBUSY; |
582 | goto out; | 577 | goto out; |
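
With the kmod.c hunks above, an empty modprobe_path makes __request_module() return 0 before any usermode helper is prepared, and call_usermodehelper_exec() loses its own NULL/empty-path special cases. A hedged sketch of what that means for a caller is below; foo_* is invented, and the wrapper only spells out the availability check a caller has to do anyway, since request_module() returning 0 never guaranteed that a driver is bound.

#include <linux/errno.h>
#include <linux/kmod.h>
#include <linux/types.h>

/* Placeholder: a real caller would ask its bus or subsystem. */
static bool foo_driver_present(const char *alias)
{
	return false;
}

/* "echo > /proc/sys/kernel/modprobe" now makes request_module() return 0
 * up front without forking a helper, so 0 alone says nothing about the
 * module actually having been loaded. */
static int foo_autoload(const char *alias)
{
	int err = request_module("%s", alias);

	if (err)
		return err;

	return foo_driver_present(alias) ? 0 : -ENODEV;
}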
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index bddf3b201a48..6e33498d665c 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -2332,6 +2332,7 @@ static ssize_t write_enabled_file_bool(struct file *file, | |||
2332 | if (copy_from_user(buf, user_buf, buf_size)) | 2332 | if (copy_from_user(buf, user_buf, buf_size)) |
2333 | return -EFAULT; | 2333 | return -EFAULT; |
2334 | 2334 | ||
2335 | buf[buf_size] = '\0'; | ||
2335 | switch (buf[0]) { | 2336 | switch (buf[0]) { |
2336 | case 'y': | 2337 | case 'y': |
2337 | case 'Y': | 2338 | case 'Y': |
@@ -2343,6 +2344,8 @@ static ssize_t write_enabled_file_bool(struct file *file, | |||
2343 | case '0': | 2344 | case '0': |
2344 | disarm_all_kprobes(); | 2345 | disarm_all_kprobes(); |
2345 | break; | 2346 | break; |
2347 | default: | ||
2348 | return -EINVAL; | ||
2346 | } | 2349 | } |
2347 | 2350 | ||
2348 | return count; | 2351 | return count; |
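
The kprobes hunk above NUL-terminates the copied buffer and rejects anything other than y/Y/1 or n/N/0 with -EINVAL instead of silently accepting it. An illustrative user-space probe of that behaviour follows; it assumes debugfs is mounted at /sys/kernel/debug and that CONFIG_KPROBES is enabled.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/debug/kprobes/enabled", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (write(fd, "x", 1) < 0)	/* now rejected with EINVAL */
		perror("write (junk)");

	if (write(fd, "0", 1) < 0)	/* still disarms all kprobes */
		perror("write (0)");

	close(fd);
	return 0;
}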
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 1f3186b37fd5..e16c45b9ee77 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -4090,7 +4090,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) | |||
4090 | } | 4090 | } |
4091 | EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); | 4091 | EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); |
4092 | 4092 | ||
4093 | static void print_held_locks_bug(struct task_struct *curr) | 4093 | static void print_held_locks_bug(void) |
4094 | { | 4094 | { |
4095 | if (!debug_locks_off()) | 4095 | if (!debug_locks_off()) |
4096 | return; | 4096 | return; |
@@ -4099,22 +4099,21 @@ static void print_held_locks_bug(struct task_struct *curr) | |||
4099 | 4099 | ||
4100 | printk("\n"); | 4100 | printk("\n"); |
4101 | printk("=====================================\n"); | 4101 | printk("=====================================\n"); |
4102 | printk("[ BUG: lock held at task exit time! ]\n"); | 4102 | printk("[ BUG: %s/%d still has locks held! ]\n", |
4103 | current->comm, task_pid_nr(current)); | ||
4103 | print_kernel_ident(); | 4104 | print_kernel_ident(); |
4104 | printk("-------------------------------------\n"); | 4105 | printk("-------------------------------------\n"); |
4105 | printk("%s/%d is exiting with locks still held!\n", | 4106 | lockdep_print_held_locks(current); |
4106 | curr->comm, task_pid_nr(curr)); | ||
4107 | lockdep_print_held_locks(curr); | ||
4108 | |||
4109 | printk("\nstack backtrace:\n"); | 4107 | printk("\nstack backtrace:\n"); |
4110 | dump_stack(); | 4108 | dump_stack(); |
4111 | } | 4109 | } |
4112 | 4110 | ||
4113 | void debug_check_no_locks_held(struct task_struct *task) | 4111 | void debug_check_no_locks_held(void) |
4114 | { | 4112 | { |
4115 | if (unlikely(task->lockdep_depth > 0)) | 4113 | if (unlikely(current->lockdep_depth > 0)) |
4116 | print_held_locks_bug(task); | 4114 | print_held_locks_bug(); |
4117 | } | 4115 | } |
4116 | EXPORT_SYMBOL_GPL(debug_check_no_locks_held); | ||
4118 | 4117 | ||
4119 | void debug_show_all_locks(void) | 4118 | void debug_show_all_locks(void) |
4120 | { | 4119 | { |
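
Above, debug_check_no_locks_held() loses its task argument, always inspects current, and gains an EXPORT_SYMBOL_GPL, so modules can place the check at their own sleep points. A minimal sketch under those assumptions; struct foo_req and the helper are invented.

#include <linux/debug_locks.h>
#include <linux/sched.h>
#include <linux/wait.h>

struct foo_req {			/* invented request object */
	wait_queue_head_t waitq;
	bool done;
};

static void foo_wait_for_reply(struct foo_req *req)
{
	wait_event(req->waitq, req->done);

	/* If current still holds locks at this point, lockdep now prints
	 * "BUG: <comm>/<pid> still has locks held!" with the held locks. */
	debug_check_no_locks_held();
}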
diff --git a/kernel/module.c b/kernel/module.c index cab4bce49c23..206915830d29 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -455,7 +455,7 @@ const struct kernel_symbol *find_symbol(const char *name, | |||
455 | EXPORT_SYMBOL_GPL(find_symbol); | 455 | EXPORT_SYMBOL_GPL(find_symbol); |
456 | 456 | ||
457 | /* Search for module by name: must hold module_mutex. */ | 457 | /* Search for module by name: must hold module_mutex. */ |
458 | static struct module *find_module_all(const char *name, | 458 | static struct module *find_module_all(const char *name, size_t len, |
459 | bool even_unformed) | 459 | bool even_unformed) |
460 | { | 460 | { |
461 | struct module *mod; | 461 | struct module *mod; |
@@ -463,7 +463,7 @@ static struct module *find_module_all(const char *name, | |||
463 | list_for_each_entry(mod, &modules, list) { | 463 | list_for_each_entry(mod, &modules, list) { |
464 | if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) | 464 | if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) |
465 | continue; | 465 | continue; |
466 | if (strcmp(mod->name, name) == 0) | 466 | if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) |
467 | return mod; | 467 | return mod; |
468 | } | 468 | } |
469 | return NULL; | 469 | return NULL; |
@@ -471,7 +471,7 @@ static struct module *find_module_all(const char *name, | |||
471 | 471 | ||
472 | struct module *find_module(const char *name) | 472 | struct module *find_module(const char *name) |
473 | { | 473 | { |
474 | return find_module_all(name, false); | 474 | return find_module_all(name, strlen(name), false); |
475 | } | 475 | } |
476 | EXPORT_SYMBOL_GPL(find_module); | 476 | EXPORT_SYMBOL_GPL(find_module); |
477 | 477 | ||
@@ -482,23 +482,28 @@ static inline void __percpu *mod_percpu(struct module *mod) | |||
482 | return mod->percpu; | 482 | return mod->percpu; |
483 | } | 483 | } |
484 | 484 | ||
485 | static int percpu_modalloc(struct module *mod, | 485 | static int percpu_modalloc(struct module *mod, struct load_info *info) |
486 | unsigned long size, unsigned long align) | ||
487 | { | 486 | { |
487 | Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu]; | ||
488 | unsigned long align = pcpusec->sh_addralign; | ||
489 | |||
490 | if (!pcpusec->sh_size) | ||
491 | return 0; | ||
492 | |||
488 | if (align > PAGE_SIZE) { | 493 | if (align > PAGE_SIZE) { |
489 | printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", | 494 | printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", |
490 | mod->name, align, PAGE_SIZE); | 495 | mod->name, align, PAGE_SIZE); |
491 | align = PAGE_SIZE; | 496 | align = PAGE_SIZE; |
492 | } | 497 | } |
493 | 498 | ||
494 | mod->percpu = __alloc_reserved_percpu(size, align); | 499 | mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align); |
495 | if (!mod->percpu) { | 500 | if (!mod->percpu) { |
496 | printk(KERN_WARNING | 501 | printk(KERN_WARNING |
497 | "%s: Could not allocate %lu bytes percpu data\n", | 502 | "%s: Could not allocate %lu bytes percpu data\n", |
498 | mod->name, size); | 503 | mod->name, (unsigned long)pcpusec->sh_size); |
499 | return -ENOMEM; | 504 | return -ENOMEM; |
500 | } | 505 | } |
501 | mod->percpu_size = size; | 506 | mod->percpu_size = pcpusec->sh_size; |
502 | return 0; | 507 | return 0; |
503 | } | 508 | } |
504 | 509 | ||
@@ -563,10 +568,12 @@ static inline void __percpu *mod_percpu(struct module *mod) | |||
563 | { | 568 | { |
564 | return NULL; | 569 | return NULL; |
565 | } | 570 | } |
566 | static inline int percpu_modalloc(struct module *mod, | 571 | static int percpu_modalloc(struct module *mod, struct load_info *info) |
567 | unsigned long size, unsigned long align) | ||
568 | { | 572 | { |
569 | return -ENOMEM; | 573 | /* UP modules shouldn't have this section: ENOMEM isn't quite right */ |
574 | if (info->sechdrs[info->index.pcpu].sh_size != 0) | ||
575 | return -ENOMEM; | ||
576 | return 0; | ||
570 | } | 577 | } |
571 | static inline void percpu_modfree(struct module *mod) | 578 | static inline void percpu_modfree(struct module *mod) |
572 | { | 579 | { |
@@ -2927,7 +2934,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) | |||
2927 | { | 2934 | { |
2928 | /* Module within temporary copy. */ | 2935 | /* Module within temporary copy. */ |
2929 | struct module *mod; | 2936 | struct module *mod; |
2930 | Elf_Shdr *pcpusec; | ||
2931 | int err; | 2937 | int err; |
2932 | 2938 | ||
2933 | mod = setup_load_info(info, flags); | 2939 | mod = setup_load_info(info, flags); |
@@ -2942,17 +2948,10 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) | |||
2942 | err = module_frob_arch_sections(info->hdr, info->sechdrs, | 2948 | err = module_frob_arch_sections(info->hdr, info->sechdrs, |
2943 | info->secstrings, mod); | 2949 | info->secstrings, mod); |
2944 | if (err < 0) | 2950 | if (err < 0) |
2945 | goto out; | 2951 | return ERR_PTR(err); |
2946 | 2952 | ||
2947 | pcpusec = &info->sechdrs[info->index.pcpu]; | 2953 | /* We will do a special allocation for per-cpu sections later. */ |
2948 | if (pcpusec->sh_size) { | 2954 | info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; |
2949 | /* We have a special allocation for this section. */ | ||
2950 | err = percpu_modalloc(mod, | ||
2951 | pcpusec->sh_size, pcpusec->sh_addralign); | ||
2952 | if (err) | ||
2953 | goto out; | ||
2954 | pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
2955 | } | ||
2956 | 2955 | ||
2957 | /* Determine total sizes, and put offsets in sh_entsize. For now | 2956 | /* Determine total sizes, and put offsets in sh_entsize. For now |
2958 | this is done generically; there doesn't appear to be any | 2957 | this is done generically; there doesn't appear to be any |
@@ -2963,17 +2962,12 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) | |||
2963 | /* Allocate and move to the final place */ | 2962 | /* Allocate and move to the final place */ |
2964 | err = move_module(mod, info); | 2963 | err = move_module(mod, info); |
2965 | if (err) | 2964 | if (err) |
2966 | goto free_percpu; | 2965 | return ERR_PTR(err); |
2967 | 2966 | ||
2968 | /* Module has been copied to its final place now: return it. */ | 2967 | /* Module has been copied to its final place now: return it. */ |
2969 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; | 2968 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; |
2970 | kmemleak_load_module(mod, info); | 2969 | kmemleak_load_module(mod, info); |
2971 | return mod; | 2970 | return mod; |
2972 | |||
2973 | free_percpu: | ||
2974 | percpu_modfree(mod); | ||
2975 | out: | ||
2976 | return ERR_PTR(err); | ||
2977 | } | 2971 | } |
2978 | 2972 | ||
2979 | /* mod is no longer valid after this! */ | 2973 | /* mod is no longer valid after this! */ |
@@ -3014,7 +3008,7 @@ static bool finished_loading(const char *name) | |||
3014 | bool ret; | 3008 | bool ret; |
3015 | 3009 | ||
3016 | mutex_lock(&module_mutex); | 3010 | mutex_lock(&module_mutex); |
3017 | mod = find_module_all(name, true); | 3011 | mod = find_module_all(name, strlen(name), true); |
3018 | ret = !mod || mod->state == MODULE_STATE_LIVE | 3012 | ret = !mod || mod->state == MODULE_STATE_LIVE |
3019 | || mod->state == MODULE_STATE_GOING; | 3013 | || mod->state == MODULE_STATE_GOING; |
3020 | mutex_unlock(&module_mutex); | 3014 | mutex_unlock(&module_mutex); |
@@ -3152,7 +3146,8 @@ static int add_unformed_module(struct module *mod) | |||
3152 | 3146 | ||
3153 | again: | 3147 | again: |
3154 | mutex_lock(&module_mutex); | 3148 | mutex_lock(&module_mutex); |
3155 | if ((old = find_module_all(mod->name, true)) != NULL) { | 3149 | old = find_module_all(mod->name, strlen(mod->name), true); |
3150 | if (old != NULL) { | ||
3156 | if (old->state == MODULE_STATE_COMING | 3151 | if (old->state == MODULE_STATE_COMING |
3157 | || old->state == MODULE_STATE_UNFORMED) { | 3152 | || old->state == MODULE_STATE_UNFORMED) { |
3158 | /* Wait in case it fails to load. */ | 3153 | /* Wait in case it fails to load. */ |
@@ -3198,6 +3193,17 @@ out: | |||
3198 | return err; | 3193 | return err; |
3199 | } | 3194 | } |
3200 | 3195 | ||
3196 | static int unknown_module_param_cb(char *param, char *val, const char *modname) | ||
3197 | { | ||
3198 | /* Check for magic 'dyndbg' arg */ | ||
3199 | int ret = ddebug_dyndbg_module_param_cb(param, val, modname); | ||
3200 | if (ret != 0) { | ||
3201 | printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n", | ||
3202 | modname, param); | ||
3203 | } | ||
3204 | return 0; | ||
3205 | } | ||
3206 | |||
3201 | /* Allocate and load the module: note that size of section 0 is always | 3207 | /* Allocate and load the module: note that size of section 0 is always |
3202 | zero, and we rely on this for optional sections. */ | 3208 | zero, and we rely on this for optional sections. */ |
3203 | static int load_module(struct load_info *info, const char __user *uargs, | 3209 | static int load_module(struct load_info *info, const char __user *uargs, |
@@ -3237,6 +3243,11 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3237 | } | 3243 | } |
3238 | #endif | 3244 | #endif |
3239 | 3245 | ||
3246 | /* To avoid stressing percpu allocator, do this once we're unique. */ | ||
3247 | err = percpu_modalloc(mod, info); | ||
3248 | if (err) | ||
3249 | goto unlink_mod; | ||
3250 | |||
3240 | /* Now module is in final location, initialize linked lists, etc. */ | 3251 | /* Now module is in final location, initialize linked lists, etc. */ |
3241 | err = module_unload_init(mod); | 3252 | err = module_unload_init(mod); |
3242 | if (err) | 3253 | if (err) |
@@ -3284,7 +3295,7 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3284 | 3295 | ||
3285 | /* Module is ready to execute: parsing args may do that. */ | 3296 | /* Module is ready to execute: parsing args may do that. */ |
3286 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, | 3297 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, |
3287 | -32768, 32767, &ddebug_dyndbg_module_param_cb); | 3298 | -32768, 32767, unknown_module_param_cb); |
3288 | if (err < 0) | 3299 | if (err < 0) |
3289 | goto bug_cleanup; | 3300 | goto bug_cleanup; |
3290 | 3301 | ||
@@ -3563,10 +3574,8 @@ unsigned long module_kallsyms_lookup_name(const char *name) | |||
3563 | /* Don't lock: we're in enough trouble already. */ | 3574 | /* Don't lock: we're in enough trouble already. */ |
3564 | preempt_disable(); | 3575 | preempt_disable(); |
3565 | if ((colon = strchr(name, ':')) != NULL) { | 3576 | if ((colon = strchr(name, ':')) != NULL) { |
3566 | *colon = '\0'; | 3577 | if ((mod = find_module_all(name, colon - name, false)) != NULL) |
3567 | if ((mod = find_module(name)) != NULL) | ||
3568 | ret = mod_find_symname(mod, colon+1); | 3578 | ret = mod_find_symname(mod, colon+1); |
3569 | *colon = ':'; | ||
3570 | } else { | 3579 | } else { |
3571 | list_for_each_entry_rcu(mod, &modules, list) { | 3580 | list_for_each_entry_rcu(mod, &modules, list) { |
3572 | if (mod->state == MODULE_STATE_UNFORMED) | 3581 | if (mod->state == MODULE_STATE_UNFORMED) |
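
The module.c hunks above make find_module_all() length-bounded so module_kallsyms_lookup_name() can parse "module:symbol" without writing a temporary '\0' into the string, defer the per-cpu allocation until the module is known to be unique, and turn unknown module parameters into a warning rather than a load failure. The snippet below only illustrates the first point, restating the strlen()+memcmp() test in standalone form; the names are invented.

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Same exact-length semantics as the old strcmp(), but the caller can pass
 * a length taken up to the ':' without NUL-terminating the string first. */
static bool module_name_matches(const char *mod_name,
				const char *name, size_t len)
{
	return strlen(mod_name) == len && !memcmp(mod_name, name, len);
}

/*
 * module_name_matches("usbcore", "usb", 3) -> false
 * module_name_matches("usb",     "usb", 3) -> true
 */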
diff --git a/kernel/panic.c b/kernel/panic.c index 167ec097ce8b..801864600514 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/notifier.h> | 15 | #include <linux/notifier.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/random.h> | 17 | #include <linux/random.h> |
18 | #include <linux/ftrace.h> | ||
18 | #include <linux/reboot.h> | 19 | #include <linux/reboot.h> |
19 | #include <linux/delay.h> | 20 | #include <linux/delay.h> |
20 | #include <linux/kexec.h> | 21 | #include <linux/kexec.h> |
@@ -399,8 +400,11 @@ struct slowpath_args { | |||
399 | static void warn_slowpath_common(const char *file, int line, void *caller, | 400 | static void warn_slowpath_common(const char *file, int line, void *caller, |
400 | unsigned taint, struct slowpath_args *args) | 401 | unsigned taint, struct slowpath_args *args) |
401 | { | 402 | { |
402 | printk(KERN_WARNING "------------[ cut here ]------------\n"); | 403 | disable_trace_on_warning(); |
403 | printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); | 404 | |
405 | pr_warn("------------[ cut here ]------------\n"); | ||
406 | pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", | ||
407 | raw_smp_processor_id(), current->pid, file, line, caller); | ||
404 | 408 | ||
405 | if (args) | 409 | if (args) |
406 | vprintk(args->fmt, args->args); | 410 | vprintk(args->fmt, args->args); |
diff --git a/kernel/params.c b/kernel/params.c index 53b958fcd639..440e65d1a544 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -787,7 +787,7 @@ static void __init kernel_add_sysfs_param(const char *name, | |||
787 | } | 787 | } |
788 | 788 | ||
789 | /* | 789 | /* |
790 | * param_sysfs_builtin - add contents in /sys/parameters for built-in modules | 790 | * param_sysfs_builtin - add sysfs parameters for built-in modules |
791 | * | 791 | * |
792 | * Add module_parameters to sysfs for "modules" built into the kernel. | 792 | * Add module_parameters to sysfs for "modules" built into the kernel. |
793 | * | 793 | * |
diff --git a/kernel/pid.c b/kernel/pid.c index 0db3e791a06d..66505c1dfc51 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -75,6 +75,7 @@ struct pid_namespace init_pid_ns = { | |||
75 | [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } | 75 | [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } |
76 | }, | 76 | }, |
77 | .last_pid = 0, | 77 | .last_pid = 0, |
78 | .nr_hashed = PIDNS_HASH_ADDING, | ||
78 | .level = 0, | 79 | .level = 0, |
79 | .child_reaper = &init_task, | 80 | .child_reaper = &init_task, |
80 | .user_ns = &init_user_ns, | 81 | .user_ns = &init_user_ns, |
@@ -373,14 +374,10 @@ EXPORT_SYMBOL_GPL(find_vpid); | |||
373 | /* | 374 | /* |
374 | * attach_pid() must be called with the tasklist_lock write-held. | 375 | * attach_pid() must be called with the tasklist_lock write-held. |
375 | */ | 376 | */ |
376 | void attach_pid(struct task_struct *task, enum pid_type type, | 377 | void attach_pid(struct task_struct *task, enum pid_type type) |
377 | struct pid *pid) | ||
378 | { | 378 | { |
379 | struct pid_link *link; | 379 | struct pid_link *link = &task->pids[type]; |
380 | 380 | hlist_add_head_rcu(&link->node, &link->pid->tasks[type]); | |
381 | link = &task->pids[type]; | ||
382 | link->pid = pid; | ||
383 | hlist_add_head_rcu(&link->node, &pid->tasks[type]); | ||
384 | } | 381 | } |
385 | 382 | ||
386 | static void __change_pid(struct task_struct *task, enum pid_type type, | 383 | static void __change_pid(struct task_struct *task, enum pid_type type, |
@@ -412,7 +409,7 @@ void change_pid(struct task_struct *task, enum pid_type type, | |||
412 | struct pid *pid) | 409 | struct pid *pid) |
413 | { | 410 | { |
414 | __change_pid(task, type, pid); | 411 | __change_pid(task, type, pid); |
415 | attach_pid(task, type, pid); | 412 | attach_pid(task, type); |
416 | } | 413 | } |
417 | 414 | ||
418 | /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ | 415 | /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ |
@@ -594,7 +591,6 @@ void __init pidmap_init(void) | |||
594 | /* Reserve PID 0. We never call free_pidmap(0) */ | 591 | /* Reserve PID 0. We never call free_pidmap(0) */ |
595 | set_bit(0, init_pid_ns.pidmap[0].page); | 592 | set_bit(0, init_pid_ns.pidmap[0].page); |
596 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); | 593 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); |
597 | init_pid_ns.nr_hashed = PIDNS_HASH_ADDING; | ||
598 | 594 | ||
599 | init_pid_ns.pid_cachep = KMEM_CACHE(pid, | 595 | init_pid_ns.pid_cachep = KMEM_CACHE(pid, |
600 | SLAB_HWCACHE_ALIGN | SLAB_PANIC); | 596 | SLAB_HWCACHE_ALIGN | SLAB_PANIC); |
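
In the pid.c hunks above, attach_pid() drops its struct pid argument and reads the pid from task->pids[type].pid, and init_pid_ns starts out with PIDNS_HASH_ADDING set instead of having pidmap_init() patch it in later. A sketch of the new caller contract, with the caller expected to fill in the pid_link first (under tasklist_lock in real code); foo_attach_pid() is invented.

#include <linux/pid.h>
#include <linux/sched.h>

/* attach_pid() now takes the pid from the task's own pid_link, so the
 * caller stores it there before attaching. */
static void foo_attach_pid(struct task_struct *task, struct pid *pid)
{
	task->pids[PIDTYPE_PID].pid = pid;
	attach_pid(task, PIDTYPE_PID);
}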
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 42670e9b44e0..c7f31aa272f7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock) | |||
51 | return error; | 51 | return error; |
52 | } | 52 | } |
53 | 53 | ||
54 | static inline union cpu_time_count | 54 | static inline unsigned long long |
55 | timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) | 55 | timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) |
56 | { | 56 | { |
57 | union cpu_time_count ret; | 57 | unsigned long long ret; |
58 | ret.sched = 0; /* high half always zero when .cpu used */ | 58 | |
59 | ret = 0; /* high half always zero when .cpu used */ | ||
59 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | 60 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { |
60 | ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; | 61 | ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; |
61 | } else { | 62 | } else { |
62 | ret.cpu = timespec_to_cputime(tp); | 63 | ret = cputime_to_expires(timespec_to_cputime(tp)); |
63 | } | 64 | } |
64 | return ret; | 65 | return ret; |
65 | } | 66 | } |
66 | 67 | ||
67 | static void sample_to_timespec(const clockid_t which_clock, | 68 | static void sample_to_timespec(const clockid_t which_clock, |
68 | union cpu_time_count cpu, | 69 | unsigned long long expires, |
69 | struct timespec *tp) | 70 | struct timespec *tp) |
70 | { | 71 | { |
71 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) | 72 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) |
72 | *tp = ns_to_timespec(cpu.sched); | 73 | *tp = ns_to_timespec(expires); |
73 | else | 74 | else |
74 | cputime_to_timespec(cpu.cpu, tp); | 75 | cputime_to_timespec((__force cputime_t)expires, tp); |
75 | } | ||
76 | |||
77 | static inline int cpu_time_before(const clockid_t which_clock, | ||
78 | union cpu_time_count now, | ||
79 | union cpu_time_count then) | ||
80 | { | ||
81 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | ||
82 | return now.sched < then.sched; | ||
83 | } else { | ||
84 | return now.cpu < then.cpu; | ||
85 | } | ||
86 | } | ||
87 | static inline void cpu_time_add(const clockid_t which_clock, | ||
88 | union cpu_time_count *acc, | ||
89 | union cpu_time_count val) | ||
90 | { | ||
91 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | ||
92 | acc->sched += val.sched; | ||
93 | } else { | ||
94 | acc->cpu += val.cpu; | ||
95 | } | ||
96 | } | ||
97 | static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, | ||
98 | union cpu_time_count a, | ||
99 | union cpu_time_count b) | ||
100 | { | ||
101 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | ||
102 | a.sched -= b.sched; | ||
103 | } else { | ||
104 | a.cpu -= b.cpu; | ||
105 | } | ||
106 | return a; | ||
107 | } | 76 | } |
108 | 77 | ||
109 | /* | 78 | /* |
@@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, | |||
111 | * given the current clock sample. | 80 | * given the current clock sample. |
112 | */ | 81 | */ |
113 | static void bump_cpu_timer(struct k_itimer *timer, | 82 | static void bump_cpu_timer(struct k_itimer *timer, |
114 | union cpu_time_count now) | 83 | unsigned long long now) |
115 | { | 84 | { |
116 | int i; | 85 | int i; |
86 | unsigned long long delta, incr; | ||
117 | 87 | ||
118 | if (timer->it.cpu.incr.sched == 0) | 88 | if (timer->it.cpu.incr == 0) |
119 | return; | 89 | return; |
120 | 90 | ||
121 | if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { | 91 | if (now < timer->it.cpu.expires) |
122 | unsigned long long delta, incr; | 92 | return; |
123 | 93 | ||
124 | if (now.sched < timer->it.cpu.expires.sched) | 94 | incr = timer->it.cpu.incr; |
125 | return; | 95 | delta = now + incr - timer->it.cpu.expires; |
126 | incr = timer->it.cpu.incr.sched; | ||
127 | delta = now.sched + incr - timer->it.cpu.expires.sched; | ||
128 | /* Don't use (incr*2 < delta), incr*2 might overflow. */ | ||
129 | for (i = 0; incr < delta - incr; i++) | ||
130 | incr = incr << 1; | ||
131 | for (; i >= 0; incr >>= 1, i--) { | ||
132 | if (delta < incr) | ||
133 | continue; | ||
134 | timer->it.cpu.expires.sched += incr; | ||
135 | timer->it_overrun += 1 << i; | ||
136 | delta -= incr; | ||
137 | } | ||
138 | } else { | ||
139 | cputime_t delta, incr; | ||
140 | 96 | ||
141 | if (now.cpu < timer->it.cpu.expires.cpu) | 97 | /* Don't use (incr*2 < delta), incr*2 might overflow. */ |
142 | return; | 98 | for (i = 0; incr < delta - incr; i++) |
143 | incr = timer->it.cpu.incr.cpu; | 99 | incr = incr << 1; |
144 | delta = now.cpu + incr - timer->it.cpu.expires.cpu; | 100 | |
145 | /* Don't use (incr*2 < delta), incr*2 might overflow. */ | 101 | for (; i >= 0; incr >>= 1, i--) { |
146 | for (i = 0; incr < delta - incr; i++) | 102 | if (delta < incr) |
147 | incr += incr; | 103 | continue; |
148 | for (; i >= 0; incr = incr >> 1, i--) { | 104 | |
149 | if (delta < incr) | 105 | timer->it.cpu.expires += incr; |
150 | continue; | 106 | timer->it_overrun += 1 << i; |
151 | timer->it.cpu.expires.cpu += incr; | 107 | delta -= incr; |
152 | timer->it_overrun += 1 << i; | ||
153 | delta -= incr; | ||
154 | } | ||
155 | } | 108 | } |
156 | } | 109 | } |
157 | 110 | ||
@@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime) | |||
170 | return 0; | 123 | return 0; |
171 | } | 124 | } |
172 | 125 | ||
173 | static inline cputime_t prof_ticks(struct task_struct *p) | 126 | static inline unsigned long long prof_ticks(struct task_struct *p) |
174 | { | 127 | { |
175 | cputime_t utime, stime; | 128 | cputime_t utime, stime; |
176 | 129 | ||
177 | task_cputime(p, &utime, &stime); | 130 | task_cputime(p, &utime, &stime); |
178 | 131 | ||
179 | return utime + stime; | 132 | return cputime_to_expires(utime + stime); |
180 | } | 133 | } |
181 | static inline cputime_t virt_ticks(struct task_struct *p) | 134 | static inline unsigned long long virt_ticks(struct task_struct *p) |
182 | { | 135 | { |
183 | cputime_t utime; | 136 | cputime_t utime; |
184 | 137 | ||
185 | task_cputime(p, &utime, NULL); | 138 | task_cputime(p, &utime, NULL); |
186 | 139 | ||
187 | return utime; | 140 | return cputime_to_expires(utime); |
188 | } | 141 | } |
189 | 142 | ||
190 | static int | 143 | static int |
@@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) | |||
225 | * Sample a per-thread clock for the given task. | 178 | * Sample a per-thread clock for the given task. |
226 | */ | 179 | */ |
227 | static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, | 180 | static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, |
228 | union cpu_time_count *cpu) | 181 | unsigned long long *sample) |
229 | { | 182 | { |
230 | switch (CPUCLOCK_WHICH(which_clock)) { | 183 | switch (CPUCLOCK_WHICH(which_clock)) { |
231 | default: | 184 | default: |
232 | return -EINVAL; | 185 | return -EINVAL; |
233 | case CPUCLOCK_PROF: | 186 | case CPUCLOCK_PROF: |
234 | cpu->cpu = prof_ticks(p); | 187 | *sample = prof_ticks(p); |
235 | break; | 188 | break; |
236 | case CPUCLOCK_VIRT: | 189 | case CPUCLOCK_VIRT: |
237 | cpu->cpu = virt_ticks(p); | 190 | *sample = virt_ticks(p); |
238 | break; | 191 | break; |
239 | case CPUCLOCK_SCHED: | 192 | case CPUCLOCK_SCHED: |
240 | cpu->sched = task_sched_runtime(p); | 193 | *sample = task_sched_runtime(p); |
241 | break; | 194 | break; |
242 | } | 195 | } |
243 | return 0; | 196 | return 0; |
@@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) | |||
284 | */ | 237 | */ |
285 | static int cpu_clock_sample_group(const clockid_t which_clock, | 238 | static int cpu_clock_sample_group(const clockid_t which_clock, |
286 | struct task_struct *p, | 239 | struct task_struct *p, |
287 | union cpu_time_count *cpu) | 240 | unsigned long long *sample) |
288 | { | 241 | { |
289 | struct task_cputime cputime; | 242 | struct task_cputime cputime; |
290 | 243 | ||
@@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock, | |||
293 | return -EINVAL; | 246 | return -EINVAL; |
294 | case CPUCLOCK_PROF: | 247 | case CPUCLOCK_PROF: |
295 | thread_group_cputime(p, &cputime); | 248 | thread_group_cputime(p, &cputime); |
296 | cpu->cpu = cputime.utime + cputime.stime; | 249 | *sample = cputime_to_expires(cputime.utime + cputime.stime); |
297 | break; | 250 | break; |
298 | case CPUCLOCK_VIRT: | 251 | case CPUCLOCK_VIRT: |
299 | thread_group_cputime(p, &cputime); | 252 | thread_group_cputime(p, &cputime); |
300 | cpu->cpu = cputime.utime; | 253 | *sample = cputime_to_expires(cputime.utime); |
301 | break; | 254 | break; |
302 | case CPUCLOCK_SCHED: | 255 | case CPUCLOCK_SCHED: |
303 | thread_group_cputime(p, &cputime); | 256 | thread_group_cputime(p, &cputime); |
304 | cpu->sched = cputime.sum_exec_runtime; | 257 | *sample = cputime.sum_exec_runtime; |
305 | break; | 258 | break; |
306 | } | 259 | } |
307 | return 0; | 260 | return 0; |
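
cpu_clock_sample() and cpu_clock_sample_group() sit behind posix_cpu_clock_get(), i.e. behind clock_gettime() on the per-thread and per-process CPU-time clocks, so the switch from union cpu_time_count to a plain unsigned long long sample is invisible to userspace. A minimal userspace illustration of the clocks these helpers service (independent of the patch; link with -lrt on older glibc):

/* Read the per-process and per-thread CPU-time clocks that the
 * cpu_clock_sample*() helpers back in the kernel. */
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec proc, thread;

	if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &proc) ||
	    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &thread)) {
		perror("clock_gettime");
		return 1;
	}

	printf("process cpu: %ld.%09ld s\n", (long)proc.tv_sec, proc.tv_nsec);
	printf("thread  cpu: %ld.%09ld s\n", (long)thread.tv_sec, thread.tv_nsec);
	return 0;
}
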
@@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) | |||
312 | { | 265 | { |
313 | const pid_t pid = CPUCLOCK_PID(which_clock); | 266 | const pid_t pid = CPUCLOCK_PID(which_clock); |
314 | int error = -EINVAL; | 267 | int error = -EINVAL; |
315 | union cpu_time_count rtn; | 268 | unsigned long long rtn; |
316 | 269 | ||
317 | if (pid == 0) { | 270 | if (pid == 0) { |
318 | /* | 271 | /* |
@@ -446,6 +399,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer) | |||
446 | return ret; | 399 | return ret; |
447 | } | 400 | } |
448 | 401 | ||
402 | static void cleanup_timers_list(struct list_head *head, | ||
403 | unsigned long long curr) | ||
404 | { | ||
405 | struct cpu_timer_list *timer, *next; | ||
406 | |||
407 | list_for_each_entry_safe(timer, next, head, entry) | ||
408 | list_del_init(&timer->entry); | ||
409 | } | ||
410 | |||
449 | /* | 411 | /* |
450 | * Clean out CPU timers still ticking when a thread exited. The task | 412 | * Clean out CPU timers still ticking when a thread exited. The task |
451 | * pointer is cleared, and the expiry time is replaced with the residual | 413 | * pointer is cleared, and the expiry time is replaced with the residual |
@@ -456,37 +418,12 @@ static void cleanup_timers(struct list_head *head, | |||
456 | cputime_t utime, cputime_t stime, | 418 | cputime_t utime, cputime_t stime, |
457 | unsigned long long sum_exec_runtime) | 419 | unsigned long long sum_exec_runtime) |
458 | { | 420 | { |
459 | struct cpu_timer_list *timer, *next; | ||
460 | cputime_t ptime = utime + stime; | ||
461 | |||
462 | list_for_each_entry_safe(timer, next, head, entry) { | ||
463 | list_del_init(&timer->entry); | ||
464 | if (timer->expires.cpu < ptime) { | ||
465 | timer->expires.cpu = 0; | ||
466 | } else { | ||
467 | timer->expires.cpu -= ptime; | ||
468 | } | ||
469 | } | ||
470 | 421 | ||
471 | ++head; | 422 | cputime_t ptime = utime + stime; |
472 | list_for_each_entry_safe(timer, next, head, entry) { | ||
473 | list_del_init(&timer->entry); | ||
474 | if (timer->expires.cpu < utime) { | ||
475 | timer->expires.cpu = 0; | ||
476 | } else { | ||
477 | timer->expires.cpu -= utime; | ||
478 | } | ||
479 | } | ||
480 | 423 | ||
481 | ++head; | 424 | cleanup_timers_list(head, cputime_to_expires(ptime)); |
482 | list_for_each_entry_safe(timer, next, head, entry) { | 425 | cleanup_timers_list(++head, cputime_to_expires(utime)); |
483 | list_del_init(&timer->entry); | 426 | cleanup_timers_list(++head, sum_exec_runtime); |
484 | if (timer->expires.sched < sum_exec_runtime) { | ||
485 | timer->expires.sched = 0; | ||
486 | } else { | ||
487 | timer->expires.sched -= sum_exec_runtime; | ||
488 | } | ||
489 | } | ||
490 | } | 427 | } |
491 | 428 | ||
492 | /* | 429 | /* |
@@ -516,17 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) | |||
516 | tsk->se.sum_exec_runtime + sig->sum_sched_runtime); | 453 | tsk->se.sum_exec_runtime + sig->sum_sched_runtime); |
517 | } | 454 | } |
518 | 455 | ||
519 | static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) | 456 | static void clear_dead_task(struct k_itimer *itimer, unsigned long long now) |
520 | { | 457 | { |
458 | struct cpu_timer_list *timer = &itimer->it.cpu; | ||
459 | |||
521 | /* | 460 | /* |
522 | * That's all for this thread or process. | 461 | * That's all for this thread or process. |
523 | * We leave our residual in expires to be reported. | 462 | * We leave our residual in expires to be reported. |
524 | */ | 463 | */ |
525 | put_task_struct(timer->it.cpu.task); | 464 | put_task_struct(timer->task); |
526 | timer->it.cpu.task = NULL; | 465 | timer->task = NULL; |
527 | timer->it.cpu.expires = cpu_time_sub(timer->it_clock, | 466 | if (timer->expires < now) { |
528 | timer->it.cpu.expires, | 467 | timer->expires = 0; |
529 | now); | 468 | } else { |
469 | timer->expires -= now; | ||
470 | } | ||
530 | } | 471 | } |
531 | 472 | ||
532 | static inline int expires_gt(cputime_t expires, cputime_t new_exp) | 473 | static inline int expires_gt(cputime_t expires, cputime_t new_exp) |
@@ -558,14 +499,14 @@ static void arm_timer(struct k_itimer *timer) | |||
558 | 499 | ||
559 | listpos = head; | 500 | listpos = head; |
560 | list_for_each_entry(next, head, entry) { | 501 | list_for_each_entry(next, head, entry) { |
561 | if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) | 502 | if (nt->expires < next->expires) |
562 | break; | 503 | break; |
563 | listpos = &next->entry; | 504 | listpos = &next->entry; |
564 | } | 505 | } |
565 | list_add(&nt->entry, listpos); | 506 | list_add(&nt->entry, listpos); |
566 | 507 | ||
567 | if (listpos == head) { | 508 | if (listpos == head) { |
568 | union cpu_time_count *exp = &nt->expires; | 509 | unsigned long long exp = nt->expires; |
569 | 510 | ||
570 | /* | 511 | /* |
571 | * We are the new earliest-expiring POSIX 1.b timer, hence | 512 | * We are the new earliest-expiring POSIX 1.b timer, hence |
@@ -576,17 +517,17 @@ static void arm_timer(struct k_itimer *timer) | |||
576 | 517 | ||
577 | switch (CPUCLOCK_WHICH(timer->it_clock)) { | 518 | switch (CPUCLOCK_WHICH(timer->it_clock)) { |
578 | case CPUCLOCK_PROF: | 519 | case CPUCLOCK_PROF: |
579 | if (expires_gt(cputime_expires->prof_exp, exp->cpu)) | 520 | if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp))) |
580 | cputime_expires->prof_exp = exp->cpu; | 521 | cputime_expires->prof_exp = expires_to_cputime(exp); |
581 | break; | 522 | break; |
582 | case CPUCLOCK_VIRT: | 523 | case CPUCLOCK_VIRT: |
583 | if (expires_gt(cputime_expires->virt_exp, exp->cpu)) | 524 | if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp))) |
584 | cputime_expires->virt_exp = exp->cpu; | 525 | cputime_expires->virt_exp = expires_to_cputime(exp); |
585 | break; | 526 | break; |
586 | case CPUCLOCK_SCHED: | 527 | case CPUCLOCK_SCHED: |
587 | if (cputime_expires->sched_exp == 0 || | 528 | if (cputime_expires->sched_exp == 0 || |
588 | cputime_expires->sched_exp > exp->sched) | 529 | cputime_expires->sched_exp > exp) |
589 | cputime_expires->sched_exp = exp->sched; | 530 | cputime_expires->sched_exp = exp; |
590 | break; | 531 | break; |
591 | } | 532 | } |
592 | } | 533 | } |
@@ -601,20 +542,20 @@ static void cpu_timer_fire(struct k_itimer *timer) | |||
601 | /* | 542 | /* |
602 | * User doesn't want any signal. | 543 | * User doesn't want any signal. |
603 | */ | 544 | */ |
604 | timer->it.cpu.expires.sched = 0; | 545 | timer->it.cpu.expires = 0; |
605 | } else if (unlikely(timer->sigq == NULL)) { | 546 | } else if (unlikely(timer->sigq == NULL)) { |
606 | /* | 547 | /* |
607 | * This is a special case for clock_nanosleep, | 548 | * This is a special case for clock_nanosleep, |
608 | * not a normal timer from sys_timer_create. | 549 | * not a normal timer from sys_timer_create. |
609 | */ | 550 | */ |
610 | wake_up_process(timer->it_process); | 551 | wake_up_process(timer->it_process); |
611 | timer->it.cpu.expires.sched = 0; | 552 | timer->it.cpu.expires = 0; |
612 | } else if (timer->it.cpu.incr.sched == 0) { | 553 | } else if (timer->it.cpu.incr == 0) { |
613 | /* | 554 | /* |
614 | * One-shot timer. Clear it as soon as it's fired. | 555 | * One-shot timer. Clear it as soon as it's fired. |
615 | */ | 556 | */ |
616 | posix_timer_event(timer, 0); | 557 | posix_timer_event(timer, 0); |
617 | timer->it.cpu.expires.sched = 0; | 558 | timer->it.cpu.expires = 0; |
618 | } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { | 559 | } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { |
619 | /* | 560 | /* |
620 | * The signal did not get queued because the signal | 561 | * The signal did not get queued because the signal |
@@ -632,7 +573,7 @@ static void cpu_timer_fire(struct k_itimer *timer) | |||
632 | */ | 573 | */ |
633 | static int cpu_timer_sample_group(const clockid_t which_clock, | 574 | static int cpu_timer_sample_group(const clockid_t which_clock, |
634 | struct task_struct *p, | 575 | struct task_struct *p, |
635 | union cpu_time_count *cpu) | 576 | unsigned long long *sample) |
636 | { | 577 | { |
637 | struct task_cputime cputime; | 578 | struct task_cputime cputime; |
638 | 579 | ||
@@ -641,13 +582,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock, | |||
641 | default: | 582 | default: |
642 | return -EINVAL; | 583 | return -EINVAL; |
643 | case CPUCLOCK_PROF: | 584 | case CPUCLOCK_PROF: |
644 | cpu->cpu = cputime.utime + cputime.stime; | 585 | *sample = cputime_to_expires(cputime.utime + cputime.stime); |
645 | break; | 586 | break; |
646 | case CPUCLOCK_VIRT: | 587 | case CPUCLOCK_VIRT: |
647 | cpu->cpu = cputime.utime; | 588 | *sample = cputime_to_expires(cputime.utime); |
648 | break; | 589 | break; |
649 | case CPUCLOCK_SCHED: | 590 | case CPUCLOCK_SCHED: |
650 | cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); | 591 | *sample = cputime.sum_exec_runtime + task_delta_exec(p); |
651 | break; | 592 | break; |
652 | } | 593 | } |
653 | return 0; | 594 | return 0; |
@@ -694,7 +635,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
694 | struct itimerspec *new, struct itimerspec *old) | 635 | struct itimerspec *new, struct itimerspec *old) |
695 | { | 636 | { |
696 | struct task_struct *p = timer->it.cpu.task; | 637 | struct task_struct *p = timer->it.cpu.task; |
697 | union cpu_time_count old_expires, new_expires, old_incr, val; | 638 | unsigned long long old_expires, new_expires, old_incr, val; |
698 | int ret; | 639 | int ret; |
699 | 640 | ||
700 | if (unlikely(p == NULL)) { | 641 | if (unlikely(p == NULL)) { |
@@ -749,7 +690,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
749 | } | 690 | } |
750 | 691 | ||
751 | if (old) { | 692 | if (old) { |
752 | if (old_expires.sched == 0) { | 693 | if (old_expires == 0) { |
753 | old->it_value.tv_sec = 0; | 694 | old->it_value.tv_sec = 0; |
754 | old->it_value.tv_nsec = 0; | 695 | old->it_value.tv_nsec = 0; |
755 | } else { | 696 | } else { |
@@ -764,11 +705,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
764 | * new setting. | 705 | * new setting. |
765 | */ | 706 | */ |
766 | bump_cpu_timer(timer, val); | 707 | bump_cpu_timer(timer, val); |
767 | if (cpu_time_before(timer->it_clock, val, | 708 | if (val < timer->it.cpu.expires) { |
768 | timer->it.cpu.expires)) { | 709 | old_expires = timer->it.cpu.expires - val; |
769 | old_expires = cpu_time_sub( | ||
770 | timer->it_clock, | ||
771 | timer->it.cpu.expires, val); | ||
772 | sample_to_timespec(timer->it_clock, | 710 | sample_to_timespec(timer->it_clock, |
773 | old_expires, | 711 | old_expires, |
774 | &old->it_value); | 712 | &old->it_value); |
@@ -791,8 +729,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
791 | goto out; | 729 | goto out; |
792 | } | 730 | } |
793 | 731 | ||
794 | if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { | 732 | if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { |
795 | cpu_time_add(timer->it_clock, &new_expires, val); | 733 | new_expires += val; |
796 | } | 734 | } |
797 | 735 | ||
798 | /* | 736 | /* |
@@ -801,8 +739,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
801 | * arm the timer (we'll just fake it for timer_gettime). | 739 | * arm the timer (we'll just fake it for timer_gettime). |
802 | */ | 740 | */ |
803 | timer->it.cpu.expires = new_expires; | 741 | timer->it.cpu.expires = new_expires; |
804 | if (new_expires.sched != 0 && | 742 | if (new_expires != 0 && val < new_expires) { |
805 | cpu_time_before(timer->it_clock, val, new_expires)) { | ||
806 | arm_timer(timer); | 743 | arm_timer(timer); |
807 | } | 744 | } |
808 | 745 | ||
@@ -826,8 +763,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
826 | timer->it_overrun_last = 0; | 763 | timer->it_overrun_last = 0; |
827 | timer->it_overrun = -1; | 764 | timer->it_overrun = -1; |
828 | 765 | ||
829 | if (new_expires.sched != 0 && | 766 | if (new_expires != 0 && !(val < new_expires)) { |
830 | !cpu_time_before(timer->it_clock, val, new_expires)) { | ||
831 | /* | 767 | /* |
832 | * The designated time already passed, so we notify | 768 | * The designated time already passed, so we notify |
833 | * immediately, even if the thread never runs to | 769 | * immediately, even if the thread never runs to |
@@ -849,7 +785,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
849 | 785 | ||
850 | static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | 786 | static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) |
851 | { | 787 | { |
852 | union cpu_time_count now; | 788 | unsigned long long now; |
853 | struct task_struct *p = timer->it.cpu.task; | 789 | struct task_struct *p = timer->it.cpu.task; |
854 | int clear_dead; | 790 | int clear_dead; |
855 | 791 | ||
@@ -859,7 +795,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
859 | sample_to_timespec(timer->it_clock, | 795 | sample_to_timespec(timer->it_clock, |
860 | timer->it.cpu.incr, &itp->it_interval); | 796 | timer->it.cpu.incr, &itp->it_interval); |
861 | 797 | ||
862 | if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ | 798 | if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ |
863 | itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; | 799 | itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; |
864 | return; | 800 | return; |
865 | } | 801 | } |
@@ -891,7 +827,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
891 | */ | 827 | */ |
892 | put_task_struct(p); | 828 | put_task_struct(p); |
893 | timer->it.cpu.task = NULL; | 829 | timer->it.cpu.task = NULL; |
894 | timer->it.cpu.expires.sched = 0; | 830 | timer->it.cpu.expires = 0; |
895 | read_unlock(&tasklist_lock); | 831 | read_unlock(&tasklist_lock); |
896 | goto dead; | 832 | goto dead; |
897 | } else { | 833 | } else { |
@@ -912,10 +848,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
912 | goto dead; | 848 | goto dead; |
913 | } | 849 | } |
914 | 850 | ||
915 | if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { | 851 | if (now < timer->it.cpu.expires) { |
916 | sample_to_timespec(timer->it_clock, | 852 | sample_to_timespec(timer->it_clock, |
917 | cpu_time_sub(timer->it_clock, | 853 | timer->it.cpu.expires - now, |
918 | timer->it.cpu.expires, now), | ||
919 | &itp->it_value); | 854 | &itp->it_value); |
920 | } else { | 855 | } else { |
921 | /* | 856 | /* |
@@ -927,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
927 | } | 862 | } |
928 | } | 863 | } |
929 | 864 | ||
865 | static unsigned long long | ||
866 | check_timers_list(struct list_head *timers, | ||
867 | struct list_head *firing, | ||
868 | unsigned long long curr) | ||
869 | { | ||
870 | int maxfire = 20; | ||
871 | |||
872 | while (!list_empty(timers)) { | ||
873 | struct cpu_timer_list *t; | ||
874 | |||
875 | t = list_first_entry(timers, struct cpu_timer_list, entry); | ||
876 | |||
877 | if (!--maxfire || curr < t->expires) | ||
878 | return t->expires; | ||
879 | |||
880 | t->firing = 1; | ||
881 | list_move_tail(&t->entry, firing); | ||
882 | } | ||
883 | |||
884 | return 0; | ||
885 | } | ||
886 | |||
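
check_timers_list() factors out the bounded walk (at most 20 firings per pass) over an expiration list that check_thread_timers() and check_process_timers() below used to open-code three times each. The entries on those lists come from timer_create() on a CPU-time clock; a small userspace sketch that arms one such timer (signal choice and values are arbitrary, for illustration only; link with -lrt on older glibc):

/* Arm a per-process CPU-time timer; the kernel keeps it on the
 * cpu_timers lists that check_timers_list() scans. The process is
 * terminated by the default SIGALRM action after ~1s of CPU time. */
#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct sigevent sev = { 0 };
	struct itimerspec its = { 0 };
	timer_t tid;

	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGALRM;

	if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid)) {
		perror("timer_create");
		return 1;
	}

	its.it_value.tv_sec = 1;	/* fire after 1s of consumed CPU time */
	its.it_interval.tv_sec = 1;	/* and every further CPU second after that */
	if (timer_settime(tid, 0, &its, NULL)) {
		perror("timer_settime");
		return 1;
	}

	for (;;)			/* burn CPU so the clock advances */
		;
}
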
930 | /* | 887 | /* |
931 | * Check for any per-thread CPU timers that have fired and move them off | 888 | * Check for any per-thread CPU timers that have fired and move them off |
932 | * the tsk->cpu_timers[N] list onto the firing list. Here we update the | 889 | * the tsk->cpu_timers[N] list onto the firing list. Here we update the |
@@ -935,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
935 | static void check_thread_timers(struct task_struct *tsk, | 892 | static void check_thread_timers(struct task_struct *tsk, |
936 | struct list_head *firing) | 893 | struct list_head *firing) |
937 | { | 894 | { |
938 | int maxfire; | ||
939 | struct list_head *timers = tsk->cpu_timers; | 895 | struct list_head *timers = tsk->cpu_timers; |
940 | struct signal_struct *const sig = tsk->signal; | 896 | struct signal_struct *const sig = tsk->signal; |
897 | struct task_cputime *tsk_expires = &tsk->cputime_expires; | ||
898 | unsigned long long expires; | ||
941 | unsigned long soft; | 899 | unsigned long soft; |
942 | 900 | ||
943 | maxfire = 20; | 901 | expires = check_timers_list(timers, firing, prof_ticks(tsk)); |
944 | tsk->cputime_expires.prof_exp = 0; | 902 | tsk_expires->prof_exp = expires_to_cputime(expires); |
945 | while (!list_empty(timers)) { | ||
946 | struct cpu_timer_list *t = list_first_entry(timers, | ||
947 | struct cpu_timer_list, | ||
948 | entry); | ||
949 | if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { | ||
950 | tsk->cputime_expires.prof_exp = t->expires.cpu; | ||
951 | break; | ||
952 | } | ||
953 | t->firing = 1; | ||
954 | list_move_tail(&t->entry, firing); | ||
955 | } | ||
956 | 903 | ||
957 | ++timers; | 904 | expires = check_timers_list(++timers, firing, virt_ticks(tsk)); |
958 | maxfire = 20; | 905 | tsk_expires->virt_exp = expires_to_cputime(expires); |
959 | tsk->cputime_expires.virt_exp = 0; | ||
960 | while (!list_empty(timers)) { | ||
961 | struct cpu_timer_list *t = list_first_entry(timers, | ||
962 | struct cpu_timer_list, | ||
963 | entry); | ||
964 | if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { | ||
965 | tsk->cputime_expires.virt_exp = t->expires.cpu; | ||
966 | break; | ||
967 | } | ||
968 | t->firing = 1; | ||
969 | list_move_tail(&t->entry, firing); | ||
970 | } | ||
971 | 906 | ||
972 | ++timers; | 907 | tsk_expires->sched_exp = check_timers_list(++timers, firing, |
973 | maxfire = 20; | 908 | tsk->se.sum_exec_runtime); |
974 | tsk->cputime_expires.sched_exp = 0; | ||
975 | while (!list_empty(timers)) { | ||
976 | struct cpu_timer_list *t = list_first_entry(timers, | ||
977 | struct cpu_timer_list, | ||
978 | entry); | ||
979 | if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { | ||
980 | tsk->cputime_expires.sched_exp = t->expires.sched; | ||
981 | break; | ||
982 | } | ||
983 | t->firing = 1; | ||
984 | list_move_tail(&t->entry, firing); | ||
985 | } | ||
986 | 909 | ||
987 | /* | 910 | /* |
988 | * Check for the special case thread timers. | 911 | * Check for the special case thread timers. |
@@ -1030,7 +953,8 @@ static void stop_process_timers(struct signal_struct *sig) | |||
1030 | static u32 onecputick; | 953 | static u32 onecputick; |
1031 | 954 | ||
1032 | static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | 955 | static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, |
1033 | cputime_t *expires, cputime_t cur_time, int signo) | 956 | unsigned long long *expires, |
957 | unsigned long long cur_time, int signo) | ||
1034 | { | 958 | { |
1035 | if (!it->expires) | 959 | if (!it->expires) |
1036 | return; | 960 | return; |
@@ -1066,9 +990,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | |||
1066 | static void check_process_timers(struct task_struct *tsk, | 990 | static void check_process_timers(struct task_struct *tsk, |
1067 | struct list_head *firing) | 991 | struct list_head *firing) |
1068 | { | 992 | { |
1069 | int maxfire; | ||
1070 | struct signal_struct *const sig = tsk->signal; | 993 | struct signal_struct *const sig = tsk->signal; |
1071 | cputime_t utime, ptime, virt_expires, prof_expires; | 994 | unsigned long long utime, ptime, virt_expires, prof_expires; |
1072 | unsigned long long sum_sched_runtime, sched_expires; | 995 | unsigned long long sum_sched_runtime, sched_expires; |
1073 | struct list_head *timers = sig->cpu_timers; | 996 | struct list_head *timers = sig->cpu_timers; |
1074 | struct task_cputime cputime; | 997 | struct task_cputime cputime; |
@@ -1078,52 +1001,13 @@ static void check_process_timers(struct task_struct *tsk, | |||
1078 | * Collect the current process totals. | 1001 | * Collect the current process totals. |
1079 | */ | 1002 | */ |
1080 | thread_group_cputimer(tsk, &cputime); | 1003 | thread_group_cputimer(tsk, &cputime); |
1081 | utime = cputime.utime; | 1004 | utime = cputime_to_expires(cputime.utime); |
1082 | ptime = utime + cputime.stime; | 1005 | ptime = utime + cputime_to_expires(cputime.stime); |
1083 | sum_sched_runtime = cputime.sum_exec_runtime; | 1006 | sum_sched_runtime = cputime.sum_exec_runtime; |
1084 | maxfire = 20; | ||
1085 | prof_expires = 0; | ||
1086 | while (!list_empty(timers)) { | ||
1087 | struct cpu_timer_list *tl = list_first_entry(timers, | ||
1088 | struct cpu_timer_list, | ||
1089 | entry); | ||
1090 | if (!--maxfire || ptime < tl->expires.cpu) { | ||
1091 | prof_expires = tl->expires.cpu; | ||
1092 | break; | ||
1093 | } | ||
1094 | tl->firing = 1; | ||
1095 | list_move_tail(&tl->entry, firing); | ||
1096 | } | ||
1097 | 1007 | ||
1098 | ++timers; | 1008 | prof_expires = check_timers_list(timers, firing, ptime); |
1099 | maxfire = 20; | 1009 | virt_expires = check_timers_list(++timers, firing, utime); |
1100 | virt_expires = 0; | 1010 | sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); |
1101 | while (!list_empty(timers)) { | ||
1102 | struct cpu_timer_list *tl = list_first_entry(timers, | ||
1103 | struct cpu_timer_list, | ||
1104 | entry); | ||
1105 | if (!--maxfire || utime < tl->expires.cpu) { | ||
1106 | virt_expires = tl->expires.cpu; | ||
1107 | break; | ||
1108 | } | ||
1109 | tl->firing = 1; | ||
1110 | list_move_tail(&tl->entry, firing); | ||
1111 | } | ||
1112 | |||
1113 | ++timers; | ||
1114 | maxfire = 20; | ||
1115 | sched_expires = 0; | ||
1116 | while (!list_empty(timers)) { | ||
1117 | struct cpu_timer_list *tl = list_first_entry(timers, | ||
1118 | struct cpu_timer_list, | ||
1119 | entry); | ||
1120 | if (!--maxfire || sum_sched_runtime < tl->expires.sched) { | ||
1121 | sched_expires = tl->expires.sched; | ||
1122 | break; | ||
1123 | } | ||
1124 | tl->firing = 1; | ||
1125 | list_move_tail(&tl->entry, firing); | ||
1126 | } | ||
1127 | 1011 | ||
1128 | /* | 1012 | /* |
1129 | * Check for the special case process timers. | 1013 | * Check for the special case process timers. |
@@ -1162,8 +1046,8 @@ static void check_process_timers(struct task_struct *tsk, | |||
1162 | } | 1046 | } |
1163 | } | 1047 | } |
1164 | 1048 | ||
1165 | sig->cputime_expires.prof_exp = prof_expires; | 1049 | sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires); |
1166 | sig->cputime_expires.virt_exp = virt_expires; | 1050 | sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires); |
1167 | sig->cputime_expires.sched_exp = sched_expires; | 1051 | sig->cputime_expires.sched_exp = sched_expires; |
1168 | if (task_cputime_zero(&sig->cputime_expires)) | 1052 | if (task_cputime_zero(&sig->cputime_expires)) |
1169 | stop_process_timers(sig); | 1053 | stop_process_timers(sig); |
@@ -1176,7 +1060,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1176 | void posix_cpu_timer_schedule(struct k_itimer *timer) | 1060 | void posix_cpu_timer_schedule(struct k_itimer *timer) |
1177 | { | 1061 | { |
1178 | struct task_struct *p = timer->it.cpu.task; | 1062 | struct task_struct *p = timer->it.cpu.task; |
1179 | union cpu_time_count now; | 1063 | unsigned long long now; |
1180 | 1064 | ||
1181 | if (unlikely(p == NULL)) | 1065 | if (unlikely(p == NULL)) |
1182 | /* | 1066 | /* |
@@ -1205,7 +1089,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1205 | */ | 1089 | */ |
1206 | put_task_struct(p); | 1090 | put_task_struct(p); |
1207 | timer->it.cpu.task = p = NULL; | 1091 | timer->it.cpu.task = p = NULL; |
1208 | timer->it.cpu.expires.sched = 0; | 1092 | timer->it.cpu.expires = 0; |
1209 | goto out_unlock; | 1093 | goto out_unlock; |
1210 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { | 1094 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { |
1211 | /* | 1095 | /* |
@@ -1213,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1213 | * not yet reaped. Take this opportunity to | 1097 | * not yet reaped. Take this opportunity to |
1214 | * drop our task ref. | 1098 | * drop our task ref. |
1215 | */ | 1099 | */ |
1100 | cpu_timer_sample_group(timer->it_clock, p, &now); | ||
1216 | clear_dead_task(timer, now); | 1101 | clear_dead_task(timer, now); |
1217 | goto out_unlock; | 1102 | goto out_unlock; |
1218 | } | 1103 | } |
@@ -1387,7 +1272,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1387 | void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | 1272 | void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, |
1388 | cputime_t *newval, cputime_t *oldval) | 1273 | cputime_t *newval, cputime_t *oldval) |
1389 | { | 1274 | { |
1390 | union cpu_time_count now; | 1275 | unsigned long long now; |
1391 | 1276 | ||
1392 | BUG_ON(clock_idx == CPUCLOCK_SCHED); | 1277 | BUG_ON(clock_idx == CPUCLOCK_SCHED); |
1393 | cpu_timer_sample_group(clock_idx, tsk, &now); | 1278 | cpu_timer_sample_group(clock_idx, tsk, &now); |
@@ -1399,17 +1284,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
1399 | * it to be absolute. | 1284 | * it to be absolute. |
1400 | */ | 1285 | */ |
1401 | if (*oldval) { | 1286 | if (*oldval) { |
1402 | if (*oldval <= now.cpu) { | 1287 | if (*oldval <= now) { |
1403 | /* Just about to fire. */ | 1288 | /* Just about to fire. */ |
1404 | *oldval = cputime_one_jiffy; | 1289 | *oldval = cputime_one_jiffy; |
1405 | } else { | 1290 | } else { |
1406 | *oldval -= now.cpu; | 1291 | *oldval -= now; |
1407 | } | 1292 | } |
1408 | } | 1293 | } |
1409 | 1294 | ||
1410 | if (!*newval) | 1295 | if (!*newval) |
1411 | goto out; | 1296 | goto out; |
1412 | *newval += now.cpu; | 1297 | *newval += now; |
1413 | } | 1298 | } |
1414 | 1299 | ||
1415 | /* | 1300 | /* |
@@ -1459,7 +1344,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
1459 | } | 1344 | } |
1460 | 1345 | ||
1461 | while (!signal_pending(current)) { | 1346 | while (!signal_pending(current)) { |
1462 | if (timer.it.cpu.expires.sched == 0) { | 1347 | if (timer.it.cpu.expires == 0) { |
1463 | /* | 1348 | /* |
1464 | * Our timer fired and was reset, below | 1349 | * Our timer fired and was reset, below |
1465 | * deletion can not fail. | 1350 | * deletion can not fail. |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9c39de095ba9..d444c4e834f4 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -262,6 +262,26 @@ config PM_GENERIC_DOMAINS | |||
262 | bool | 262 | bool |
263 | depends on PM | 263 | depends on PM |
264 | 264 | ||
265 | config WQ_POWER_EFFICIENT_DEFAULT | ||
266 | bool "Enable workqueue power-efficient mode by default" | ||
267 | depends on PM | ||
268 | default n | ||
269 | help | ||
270 | Per-cpu workqueues are generally preferred because they show | ||
271 | better performance thanks to cache locality; unfortunately, | ||
272 | per-cpu workqueues tend to be more power hungry than unbound | ||
273 | workqueues. | ||
274 | |||
275 | Enabling the workqueue.power_efficient kernel parameter makes | ||
276 | those per-cpu workqueues which were observed to contribute | ||
277 | significantly to power consumption unbound, leading to measurably | ||
278 | lower power usage at the cost of a small performance overhead. | ||
279 | |||
280 | This config option determines whether workqueue.power_efficient | ||
281 | is enabled by default. | ||
282 | |||
283 | If in doubt, say N. | ||
284 | |||
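
This option only chooses the default of the workqueue.power_efficient boot parameter; passing workqueue.power_efficient=1 (or =0) on the kernel command line still overrides it either way. As a rough sketch, a Kconfig-driven parameter default of this kind is typically wired up like the following (identifier names are illustrative; the actual code lives in kernel/workqueue.c and may differ in detail):

/* Sketch: derive a boot parameter's default from a Kconfig symbol.
 * IS_ENABLED() evaluates to 1 when CONFIG_WQ_POWER_EFFICIENT_DEFAULT
 * is set and 0 otherwise; 0444 makes the parameter read-only at runtime. */
#include <linux/kconfig.h>
#include <linux/moduleparam.h>
#include <linux/types.h>

static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);

module_param_named(power_efficient, wq_power_efficient, bool, 0444);
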
265 | config PM_GENERIC_DOMAINS_SLEEP | 285 | config PM_GENERIC_DOMAINS_SLEEP |
266 | def_bool y | 286 | def_bool y |
267 | depends on PM_SLEEP && PM_GENERIC_DOMAINS | 287 | depends on PM_SLEEP && PM_GENERIC_DOMAINS |
diff --git a/kernel/power/main.c b/kernel/power/main.c index d77663bfedeb..1d1bf630e6e9 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -424,6 +424,8 @@ static ssize_t wakeup_count_store(struct kobject *kobj, | |||
424 | if (sscanf(buf, "%u", &val) == 1) { | 424 | if (sscanf(buf, "%u", &val) == 1) { |
425 | if (pm_save_wakeup_count(val)) | 425 | if (pm_save_wakeup_count(val)) |
426 | error = n; | 426 | error = n; |
427 | else | ||
428 | pm_print_active_wakeup_sources(); | ||
427 | } | 429 | } |
428 | 430 | ||
429 | out: | 431 | out: |
@@ -528,6 +530,10 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
528 | 530 | ||
529 | if (sscanf(buf, "%d", &val) == 1) { | 531 | if (sscanf(buf, "%d", &val) == 1) { |
530 | pm_trace_enabled = !!val; | 532 | pm_trace_enabled = !!val; |
533 | if (pm_trace_enabled) { | ||
534 | pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n" | ||
535 | "PM: Correct system time has to be restored manually after resume.\n"); | ||
536 | } | ||
531 | return n; | 537 | return n; |
532 | } | 538 | } |
533 | return -EINVAL; | 539 | return -EINVAL; |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 98088e0e71e8..fc0df8486449 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -30,9 +30,10 @@ static int try_to_freeze_tasks(bool user_only) | |||
30 | unsigned int todo; | 30 | unsigned int todo; |
31 | bool wq_busy = false; | 31 | bool wq_busy = false; |
32 | struct timeval start, end; | 32 | struct timeval start, end; |
33 | u64 elapsed_csecs64; | 33 | u64 elapsed_msecs64; |
34 | unsigned int elapsed_csecs; | 34 | unsigned int elapsed_msecs; |
35 | bool wakeup = false; | 35 | bool wakeup = false; |
36 | int sleep_usecs = USEC_PER_MSEC; | ||
36 | 37 | ||
37 | do_gettimeofday(&start); | 38 | do_gettimeofday(&start); |
38 | 39 | ||
@@ -68,22 +69,25 @@ static int try_to_freeze_tasks(bool user_only) | |||
68 | 69 | ||
69 | /* | 70 | /* |
70 | * We need to retry, but first give the freezing tasks some | 71 | * We need to retry, but first give the freezing tasks some |
71 | * time to enter the refrigerator. | 72 | * time to enter the refrigerator. Start with an initial |
73 | * 1 ms sleep followed by exponential backoff until 8 ms. | ||
72 | */ | 74 | */ |
73 | msleep(10); | 75 | usleep_range(sleep_usecs / 2, sleep_usecs); |
76 | if (sleep_usecs < 8 * USEC_PER_MSEC) | ||
77 | sleep_usecs *= 2; | ||
74 | } | 78 | } |
75 | 79 | ||
76 | do_gettimeofday(&end); | 80 | do_gettimeofday(&end); |
77 | elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); | 81 | elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); |
78 | do_div(elapsed_csecs64, NSEC_PER_SEC / 100); | 82 | do_div(elapsed_msecs64, NSEC_PER_MSEC); |
79 | elapsed_csecs = elapsed_csecs64; | 83 | elapsed_msecs = elapsed_msecs64; |
80 | 84 | ||
81 | if (todo) { | 85 | if (todo) { |
82 | printk("\n"); | 86 | printk("\n"); |
83 | printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " | 87 | printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds " |
84 | "(%d tasks refusing to freeze, wq_busy=%d):\n", | 88 | "(%d tasks refusing to freeze, wq_busy=%d):\n", |
85 | wakeup ? "aborted" : "failed", | 89 | wakeup ? "aborted" : "failed", |
86 | elapsed_csecs / 100, elapsed_csecs % 100, | 90 | elapsed_msecs / 1000, elapsed_msecs % 1000, |
87 | todo - wq_busy, wq_busy); | 91 | todo - wq_busy, wq_busy); |
88 | 92 | ||
89 | if (!wakeup) { | 93 | if (!wakeup) { |
@@ -96,8 +100,8 @@ static int try_to_freeze_tasks(bool user_only) | |||
96 | read_unlock(&tasklist_lock); | 100 | read_unlock(&tasklist_lock); |
97 | } | 101 | } |
98 | } else { | 102 | } else { |
99 | printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, | 103 | printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, |
100 | elapsed_csecs % 100); | 104 | elapsed_msecs % 1000); |
101 | } | 105 | } |
102 | 106 | ||
103 | return todo ? -EBUSY : 0; | 107 | return todo ? -EBUSY : 0; |
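
The retry loop now starts with a roughly 1 ms sleep and doubles it up to a cap of 8 ms instead of a flat msleep(10), which trims latency in the common case where tasks freeze after one or two passes. A standalone userspace sketch of the same backoff shape (poll_done() is a made-up placeholder, and usleep() stands in for the kernel's usleep_range()):

/* Exponential backoff retry, mirroring try_to_freeze_tasks(): start
 * around 1 ms, double the sleep each round, cap it at 8 ms. */
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define USEC_PER_MSEC 1000

static int remaining = 5;		/* placeholder "work left" counter */

static bool poll_done(void)		/* stands in for "all tasks frozen" */
{
	return --remaining <= 0;
}

int main(void)
{
	int sleep_usecs = USEC_PER_MSEC;

	while (!poll_done()) {
		/* the kernel side uses usleep_range(sleep_usecs / 2, sleep_usecs) */
		usleep(sleep_usecs);
		if (sleep_usecs < 8 * USEC_PER_MSEC)
			sleep_usecs *= 2;
		printf("not done yet, next sleep is up to %d us\n", sleep_usecs);
	}
	return 0;
}
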
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 587dddeebf15..06fe28589e9c 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
@@ -44,6 +44,7 @@ | |||
44 | 44 | ||
45 | #include <linux/uaccess.h> | 45 | #include <linux/uaccess.h> |
46 | #include <linux/export.h> | 46 | #include <linux/export.h> |
47 | #include <trace/events/power.h> | ||
47 | 48 | ||
48 | /* | 49 | /* |
49 | * locking rule: all changes to constraints or notifiers lists | 50 | * locking rule: all changes to constraints or notifiers lists |
@@ -202,6 +203,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, | |||
202 | 203 | ||
203 | spin_unlock_irqrestore(&pm_qos_lock, flags); | 204 | spin_unlock_irqrestore(&pm_qos_lock, flags); |
204 | 205 | ||
206 | trace_pm_qos_update_target(action, prev_value, curr_value); | ||
205 | if (prev_value != curr_value) { | 207 | if (prev_value != curr_value) { |
206 | blocking_notifier_call_chain(c->notifiers, | 208 | blocking_notifier_call_chain(c->notifiers, |
207 | (unsigned long)curr_value, | 209 | (unsigned long)curr_value, |
@@ -272,6 +274,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf, | |||
272 | 274 | ||
273 | spin_unlock_irqrestore(&pm_qos_lock, irqflags); | 275 | spin_unlock_irqrestore(&pm_qos_lock, irqflags); |
274 | 276 | ||
277 | trace_pm_qos_update_flags(action, prev_value, curr_value); | ||
275 | return prev_value != curr_value; | 278 | return prev_value != curr_value; |
276 | } | 279 | } |
277 | 280 | ||
@@ -333,6 +336,7 @@ void pm_qos_add_request(struct pm_qos_request *req, | |||
333 | } | 336 | } |
334 | req->pm_qos_class = pm_qos_class; | 337 | req->pm_qos_class = pm_qos_class; |
335 | INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); | 338 | INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); |
339 | trace_pm_qos_add_request(pm_qos_class, value); | ||
336 | pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, | 340 | pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, |
337 | &req->node, PM_QOS_ADD_REQ, value); | 341 | &req->node, PM_QOS_ADD_REQ, value); |
338 | } | 342 | } |
@@ -361,6 +365,7 @@ void pm_qos_update_request(struct pm_qos_request *req, | |||
361 | 365 | ||
362 | cancel_delayed_work_sync(&req->work); | 366 | cancel_delayed_work_sync(&req->work); |
363 | 367 | ||
368 | trace_pm_qos_update_request(req->pm_qos_class, new_value); | ||
364 | if (new_value != req->node.prio) | 369 | if (new_value != req->node.prio) |
365 | pm_qos_update_target( | 370 | pm_qos_update_target( |
366 | pm_qos_array[req->pm_qos_class]->constraints, | 371 | pm_qos_array[req->pm_qos_class]->constraints, |
@@ -387,6 +392,8 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, | |||
387 | 392 | ||
388 | cancel_delayed_work_sync(&req->work); | 393 | cancel_delayed_work_sync(&req->work); |
389 | 394 | ||
395 | trace_pm_qos_update_request_timeout(req->pm_qos_class, | ||
396 | new_value, timeout_us); | ||
390 | if (new_value != req->node.prio) | 397 | if (new_value != req->node.prio) |
391 | pm_qos_update_target( | 398 | pm_qos_update_target( |
392 | pm_qos_array[req->pm_qos_class]->constraints, | 399 | pm_qos_array[req->pm_qos_class]->constraints, |
@@ -416,6 +423,7 @@ void pm_qos_remove_request(struct pm_qos_request *req) | |||
416 | 423 | ||
417 | cancel_delayed_work_sync(&req->work); | 424 | cancel_delayed_work_sync(&req->work); |
418 | 425 | ||
426 | trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE); | ||
419 | pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, | 427 | pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, |
420 | &req->node, PM_QOS_REMOVE_REQ, | 428 | &req->node, PM_QOS_REMOVE_REQ, |
421 | PM_QOS_DEFAULT_VALUE); | 429 | PM_QOS_DEFAULT_VALUE); |
@@ -477,7 +485,7 @@ static int find_pm_qos_object_by_minor(int minor) | |||
477 | { | 485 | { |
478 | int pm_qos_class; | 486 | int pm_qos_class; |
479 | 487 | ||
480 | for (pm_qos_class = 0; | 488 | for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY; |
481 | pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { | 489 | pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { |
482 | if (minor == | 490 | if (minor == |
483 | pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) | 491 | pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) |
@@ -491,7 +499,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) | |||
491 | long pm_qos_class; | 499 | long pm_qos_class; |
492 | 500 | ||
493 | pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); | 501 | pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); |
494 | if (pm_qos_class >= 0) { | 502 | if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) { |
495 | struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); | 503 | struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); |
496 | if (!req) | 504 | if (!req) |
497 | return -ENOMEM; | 505 | return -ENOMEM; |
@@ -584,7 +592,7 @@ static int __init pm_qos_power_init(void) | |||
584 | 592 | ||
585 | BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); | 593 | BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); |
586 | 594 | ||
587 | for (i = 1; i < PM_QOS_NUM_CLASSES; i++) { | 595 | for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { |
588 | ret = register_pm_qos_misc(pm_qos_array[i]); | 596 | ret = register_pm_qos_misc(pm_qos_array[i]); |
589 | if (ret < 0) { | 597 | if (ret < 0) { |
590 | printk(KERN_ERR "pm_qos_param: %s setup failed\n", | 598 | printk(KERN_ERR "pm_qos_param: %s setup failed\n", |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0de28576807d..349587bb03e1 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -642,8 +642,9 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, | |||
642 | region->end_pfn = end_pfn; | 642 | region->end_pfn = end_pfn; |
643 | list_add_tail(®ion->list, &nosave_regions); | 643 | list_add_tail(®ion->list, &nosave_regions); |
644 | Report: | 644 | Report: |
645 | printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n", | 645 | printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n", |
646 | start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); | 646 | (unsigned long long) start_pfn << PAGE_SHIFT, |
647 | ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); | ||
647 | } | 648 | } |
648 | 649 | ||
649 | /* | 650 | /* |
@@ -1651,7 +1652,7 @@ unsigned long snapshot_get_image_size(void) | |||
1651 | static int init_header(struct swsusp_info *info) | 1652 | static int init_header(struct swsusp_info *info) |
1652 | { | 1653 | { |
1653 | memset(info, 0, sizeof(struct swsusp_info)); | 1654 | memset(info, 0, sizeof(struct swsusp_info)); |
1654 | info->num_physpages = num_physpages; | 1655 | info->num_physpages = get_num_physpages(); |
1655 | info->image_pages = nr_copy_pages; | 1656 | info->image_pages = nr_copy_pages; |
1656 | info->pages = snapshot_get_image_size(); | 1657 | info->pages = snapshot_get_image_size(); |
1657 | info->size = info->pages; | 1658 | info->size = info->pages; |
@@ -1795,7 +1796,7 @@ static int check_header(struct swsusp_info *info) | |||
1795 | char *reason; | 1796 | char *reason; |
1796 | 1797 | ||
1797 | reason = check_image_kernel(info); | 1798 | reason = check_image_kernel(info); |
1798 | if (!reason && info->num_physpages != num_physpages) | 1799 | if (!reason && info->num_physpages != get_num_physpages()) |
1799 | reason = "memory size"; | 1800 | reason = "memory size"; |
1800 | if (reason) { | 1801 | if (reason) { |
1801 | printk(KERN_ERR "PM: Image mismatch: %s\n", reason); | 1802 | printk(KERN_ERR "PM: Image mismatch: %s\n", reason); |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index bef86d121eb2..ece04223bb1e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -269,7 +269,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
269 | suspend_test_start(); | 269 | suspend_test_start(); |
270 | error = dpm_suspend_start(PMSG_SUSPEND); | 270 | error = dpm_suspend_start(PMSG_SUSPEND); |
271 | if (error) { | 271 | if (error) { |
272 | printk(KERN_ERR "PM: Some devices failed to suspend\n"); | 272 | pr_err("PM: Some devices failed to suspend, or early wake event detected\n"); |
273 | goto Recover_platform; | 273 | goto Recover_platform; |
274 | } | 274 | } |
275 | suspend_test_finish("suspend devices"); | 275 | suspend_test_finish("suspend devices"); |
diff --git a/kernel/printk.c b/kernel/printk.c index 8212c1aef125..d37d45c90ae6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -1369,9 +1369,9 @@ static int console_trylock_for_printk(unsigned int cpu) | |||
1369 | } | 1369 | } |
1370 | } | 1370 | } |
1371 | logbuf_cpu = UINT_MAX; | 1371 | logbuf_cpu = UINT_MAX; |
1372 | raw_spin_unlock(&logbuf_lock); | ||
1372 | if (wake) | 1373 | if (wake) |
1373 | up(&console_sem); | 1374 | up(&console_sem); |
1374 | raw_spin_unlock(&logbuf_lock); | ||
1375 | return retval; | 1375 | return retval; |
1376 | } | 1376 | } |
1377 | 1377 | ||
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 335a7ae697f5..4041f5747e73 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -469,6 +469,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) | |||
469 | /* Architecture-specific hardware disable .. */ | 469 | /* Architecture-specific hardware disable .. */ |
470 | ptrace_disable(child); | 470 | ptrace_disable(child); |
471 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | 471 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); |
472 | flush_ptrace_hw_breakpoint(child); | ||
472 | 473 | ||
473 | write_lock_irq(&tasklist_lock); | 474 | write_lock_irq(&tasklist_lock); |
474 | /* | 475 | /* |
@@ -844,6 +845,47 @@ int ptrace_request(struct task_struct *child, long request, | |||
844 | ret = ptrace_setsiginfo(child, &siginfo); | 845 | ret = ptrace_setsiginfo(child, &siginfo); |
845 | break; | 846 | break; |
846 | 847 | ||
848 | case PTRACE_GETSIGMASK: | ||
849 | if (addr != sizeof(sigset_t)) { | ||
850 | ret = -EINVAL; | ||
851 | break; | ||
852 | } | ||
853 | |||
854 | if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t))) | ||
855 | ret = -EFAULT; | ||
856 | else | ||
857 | ret = 0; | ||
858 | |||
859 | break; | ||
860 | |||
861 | case PTRACE_SETSIGMASK: { | ||
862 | sigset_t new_set; | ||
863 | |||
864 | if (addr != sizeof(sigset_t)) { | ||
865 | ret = -EINVAL; | ||
866 | break; | ||
867 | } | ||
868 | |||
869 | if (copy_from_user(&new_set, datavp, sizeof(sigset_t))) { | ||
870 | ret = -EFAULT; | ||
871 | break; | ||
872 | } | ||
873 | |||
874 | sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
875 | |||
876 | /* | ||
877 | * Every thread does recalc_sigpending() after resume, so | ||
878 | * retarget_shared_pending() and recalc_sigpending() are not | ||
879 | * called here. | ||
880 | */ | ||
881 | spin_lock_irq(&child->sighand->siglock); | ||
882 | child->blocked = new_set; | ||
883 | spin_unlock_irq(&child->sighand->siglock); | ||
884 | |||
885 | ret = 0; | ||
886 | break; | ||
887 | } | ||
888 | |||
847 | case PTRACE_INTERRUPT: | 889 | case PTRACE_INTERRUPT: |
848 | /* | 890 | /* |
849 | * Stop tracee without any side-effect on signal or job | 891 | * Stop tracee without any side-effect on signal or job |
@@ -948,8 +990,7 @@ int ptrace_request(struct task_struct *child, long request, | |||
948 | 990 | ||
949 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | 991 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK |
950 | case PTRACE_GETREGSET: | 992 | case PTRACE_GETREGSET: |
951 | case PTRACE_SETREGSET: | 993 | case PTRACE_SETREGSET: { |
952 | { | ||
953 | struct iovec kiov; | 994 | struct iovec kiov; |
954 | struct iovec __user *uiov = datavp; | 995 | struct iovec __user *uiov = datavp; |
955 | 996 | ||
@@ -1181,19 +1222,3 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | |||
1181 | return ret; | 1222 | return ret; |
1182 | } | 1223 | } |
1183 | #endif /* CONFIG_COMPAT */ | 1224 | #endif /* CONFIG_COMPAT */ |
1184 | |||
1185 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | ||
1186 | int ptrace_get_breakpoints(struct task_struct *tsk) | ||
1187 | { | ||
1188 | if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) | ||
1189 | return 0; | ||
1190 | |||
1191 | return -1; | ||
1192 | } | ||
1193 | |||
1194 | void ptrace_put_breakpoints(struct task_struct *tsk) | ||
1195 | { | ||
1196 | if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) | ||
1197 | flush_ptrace_hw_breakpoint(tsk); | ||
1198 | } | ||
1199 | #endif /* CONFIG_HAVE_HW_BREAKPOINT */ | ||
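
PTRACE_GETSIGMASK and PTRACE_SETSIGMASK pass the sigset size in addr and a user buffer in data, and the handler above rejects any size other than the kernel's sizeof(sigset_t), i.e. _NSIG / 8 bytes rather than glibc's much larger sigset_t. A hedged userspace sketch of a helper built on the new requests, assuming a 64-signal kernel and a tracee that is already attached and stopped; the fallback request numbers are the values this series adds to the UAPI header:

/* Read a stopped tracee's blocked-signal mask, add SIGUSR1 to it and
 * write it back. KERNEL_SIGSET_SIZE assumes _NSIG == 64. */
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>

#ifndef PTRACE_GETSIGMASK
#define PTRACE_GETSIGMASK	0x420a
#define PTRACE_SETSIGMASK	0x420b
#endif

#define KERNEL_SIGSET_SIZE	8UL

static int block_sigusr1_in_tracee(pid_t pid)
{
	uint64_t mask;

	/* addr carries the sigset size, data points at the mask */
	if (ptrace(PTRACE_GETSIGMASK, pid, (void *)KERNEL_SIGSET_SIZE, &mask) == -1) {
		perror("PTRACE_GETSIGMASK");
		return -1;
	}

	mask |= 1ULL << (SIGUSR1 - 1);	/* bit n-1 blocks signal n */

	if (ptrace(PTRACE_SETSIGMASK, pid, (void *)KERNEL_SIGSET_SIZE, &mask) == -1) {
		perror("PTRACE_SETSIGMASK");
		return -1;
	}
	return 0;
}
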
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index cf3adc6fe001..e08abb9461ac 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -3026,7 +3026,7 @@ static int __init rcu_spawn_gp_kthread(void) | |||
3026 | struct task_struct *t; | 3026 | struct task_struct *t; |
3027 | 3027 | ||
3028 | for_each_rcu_flavor(rsp) { | 3028 | for_each_rcu_flavor(rsp) { |
3029 | t = kthread_run(rcu_gp_kthread, rsp, rsp->name); | 3029 | t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); |
3030 | BUG_ON(IS_ERR(t)); | 3030 | BUG_ON(IS_ERR(t)); |
3031 | rnp = rcu_get_root(rsp); | 3031 | rnp = rcu_get_root(rsp); |
3032 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3032 | raw_spin_lock_irqsave(&rnp->lock, flags); |
diff --git a/kernel/reboot.c b/kernel/reboot.c new file mode 100644 index 000000000000..269ed9384cc4 --- /dev/null +++ b/kernel/reboot.c | |||
@@ -0,0 +1,419 @@ | |||
1 | /* | ||
2 | * linux/kernel/reboot.c | ||
3 | * | ||
4 | * Copyright (C) 2013 Linus Torvalds | ||
5 | */ | ||
6 | |||
7 | #define pr_fmt(fmt) "reboot: " fmt | ||
8 | |||
9 | #include <linux/ctype.h> | ||
10 | #include <linux/export.h> | ||
11 | #include <linux/kexec.h> | ||
12 | #include <linux/kmod.h> | ||
13 | #include <linux/kmsg_dump.h> | ||
14 | #include <linux/reboot.h> | ||
15 | #include <linux/suspend.h> | ||
16 | #include <linux/syscalls.h> | ||
17 | #include <linux/syscore_ops.h> | ||
18 | #include <linux/uaccess.h> | ||
19 | |||
20 | /* | ||
21 | * this indicates whether you can reboot with ctrl-alt-del: the default is yes | ||
22 | */ | ||
23 | |||
24 | int C_A_D = 1; | ||
25 | struct pid *cad_pid; | ||
26 | EXPORT_SYMBOL(cad_pid); | ||
27 | |||
28 | #if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32) | ||
29 | #define DEFAULT_REBOOT_MODE = REBOOT_HARD | ||
30 | #else | ||
31 | #define DEFAULT_REBOOT_MODE | ||
32 | #endif | ||
33 | enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; | ||
34 | |||
35 | int reboot_default; | ||
36 | int reboot_cpu; | ||
37 | enum reboot_type reboot_type = BOOT_ACPI; | ||
38 | int reboot_force; | ||
39 | |||
40 | /* | ||
41 | * If set, this is used for preparing the system to power off. | ||
42 | */ | ||
43 | |||
44 | void (*pm_power_off_prepare)(void); | ||
45 | |||
46 | /** | ||
47 | * emergency_restart - reboot the system | ||
48 | * | ||
49 | * Without shutting down any hardware or taking any locks, | ||
50 | * reboot the system. This is called when we know we are in | ||
51 | * trouble, so this is our best effort to reboot. It is | ||
52 | * safe to call in interrupt context. | ||
53 | */ | ||
54 | void emergency_restart(void) | ||
55 | { | ||
56 | kmsg_dump(KMSG_DUMP_EMERG); | ||
57 | machine_emergency_restart(); | ||
58 | } | ||
59 | EXPORT_SYMBOL_GPL(emergency_restart); | ||
60 | |||
61 | void kernel_restart_prepare(char *cmd) | ||
62 | { | ||
63 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); | ||
64 | system_state = SYSTEM_RESTART; | ||
65 | usermodehelper_disable(); | ||
66 | device_shutdown(); | ||
67 | } | ||
68 | |||
69 | /** | ||
70 | * register_reboot_notifier - Register function to be called at reboot time | ||
71 | * @nb: Info about notifier function to be called | ||
72 | * | ||
73 | * Registers a function with the list of functions | ||
74 | * to be called at reboot time. | ||
75 | * | ||
76 | * Currently always returns zero, as blocking_notifier_chain_register() | ||
77 | * always returns zero. | ||
78 | */ | ||
79 | int register_reboot_notifier(struct notifier_block *nb) | ||
80 | { | ||
81 | return blocking_notifier_chain_register(&reboot_notifier_list, nb); | ||
82 | } | ||
83 | EXPORT_SYMBOL(register_reboot_notifier); | ||
84 | |||
85 | /** | ||
86 | * unregister_reboot_notifier - Unregister previously registered reboot notifier | ||
87 | * @nb: Hook to be unregistered | ||
88 | * | ||
89 | * Unregisters a previously registered reboot | ||
90 | * notifier function. | ||
91 | * | ||
92 | * Returns zero on success, or %-ENOENT on failure. | ||
93 | */ | ||
94 | int unregister_reboot_notifier(struct notifier_block *nb) | ||
95 | { | ||
96 | return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); | ||
97 | } | ||
98 | EXPORT_SYMBOL(unregister_reboot_notifier); | ||
99 | |||
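
A minimal, made-up module showing how a driver hooks this chain; the callback is invoked through reboot_notifier_list from kernel_restart_prepare() and kernel_shutdown_prepare() below:

/* Toy reboot-notifier user: log the shutdown action before the
 * machine restarts, halts or powers off. Module name and message
 * are invented for illustration. */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int demo_reboot_notify(struct notifier_block *nb,
			      unsigned long action, void *data)
{
	/* action is SYS_RESTART, SYS_HALT or SYS_POWER_OFF */
	pr_info("demo: going down, action=%lu\n", action);
	return NOTIFY_DONE;
}

static struct notifier_block demo_reboot_nb = {
	.notifier_call = demo_reboot_notify,
};

static int __init demo_init(void)
{
	return register_reboot_notifier(&demo_reboot_nb);
}

static void __exit demo_exit(void)
{
	unregister_reboot_notifier(&demo_reboot_nb);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
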
100 | static void migrate_to_reboot_cpu(void) | ||
101 | { | ||
102 | /* The boot cpu is always logical cpu 0 */ | ||
103 | int cpu = reboot_cpu; | ||
104 | |||
105 | cpu_hotplug_disable(); | ||
106 | |||
107 | /* Make certain the cpu I'm about to reboot on is online */ | ||
108 | if (!cpu_online(cpu)) | ||
109 | cpu = cpumask_first(cpu_online_mask); | ||
110 | |||
111 | /* Prevent races with other tasks migrating this task */ | ||
112 | current->flags |= PF_NO_SETAFFINITY; | ||
113 | |||
114 | /* Make certain I only run on the appropriate processor */ | ||
115 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
116 | } | ||
117 | |||
118 | /** | ||
119 | * kernel_restart - reboot the system | ||
120 | * @cmd: pointer to buffer containing command to execute for restart | ||
121 | * or %NULL | ||
122 | * | ||
123 | * Shutdown everything and perform a clean reboot. | ||
124 | * This is not safe to call in interrupt context. | ||
125 | */ | ||
126 | void kernel_restart(char *cmd) | ||
127 | { | ||
128 | kernel_restart_prepare(cmd); | ||
129 | migrate_to_reboot_cpu(); | ||
130 | syscore_shutdown(); | ||
131 | if (!cmd) | ||
132 | pr_emerg("Restarting system\n"); | ||
133 | else | ||
134 | pr_emerg("Restarting system with command '%s'\n", cmd); | ||
135 | kmsg_dump(KMSG_DUMP_RESTART); | ||
136 | machine_restart(cmd); | ||
137 | } | ||
138 | EXPORT_SYMBOL_GPL(kernel_restart); | ||
139 | |||
140 | static void kernel_shutdown_prepare(enum system_states state) | ||
141 | { | ||
142 | blocking_notifier_call_chain(&reboot_notifier_list, | ||
143 | (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL); | ||
144 | system_state = state; | ||
145 | usermodehelper_disable(); | ||
146 | device_shutdown(); | ||
147 | } | ||
148 | /** | ||
149 | * kernel_halt - halt the system | ||
150 | * | ||
151 | * Shutdown everything and perform a clean system halt. | ||
152 | */ | ||
153 | void kernel_halt(void) | ||
154 | { | ||
155 | kernel_shutdown_prepare(SYSTEM_HALT); | ||
156 | migrate_to_reboot_cpu(); | ||
157 | syscore_shutdown(); | ||
158 | pr_emerg("System halted\n"); | ||
159 | kmsg_dump(KMSG_DUMP_HALT); | ||
160 | machine_halt(); | ||
161 | } | ||
162 | EXPORT_SYMBOL_GPL(kernel_halt); | ||
163 | |||
164 | /** | ||
165 | * kernel_power_off - power_off the system | ||
166 | * | ||
167 | * Shutdown everything and perform a clean system power_off. | ||
168 | */ | ||
169 | void kernel_power_off(void) | ||
170 | { | ||
171 | kernel_shutdown_prepare(SYSTEM_POWER_OFF); | ||
172 | if (pm_power_off_prepare) | ||
173 | pm_power_off_prepare(); | ||
174 | migrate_to_reboot_cpu(); | ||
175 | syscore_shutdown(); | ||
176 | pr_emerg("Power down\n"); | ||
177 | kmsg_dump(KMSG_DUMP_POWEROFF); | ||
178 | machine_power_off(); | ||
179 | } | ||
180 | EXPORT_SYMBOL_GPL(kernel_power_off); | ||
181 | |||
182 | static DEFINE_MUTEX(reboot_mutex); | ||
183 | |||
184 | /* | ||
185 | * Reboot system call: for obvious reasons only root may call it, | ||
186 | * and even root needs to set up some magic numbers in the registers | ||
187 | * so that some mistake won't make this reboot the whole machine. | ||
188 | * You can also set the meaning of the ctrl-alt-del-key here. | ||
189 | * | ||
190 | * reboot doesn't sync: do that yourself before calling this. | ||
191 | */ | ||
192 | SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, | ||
193 | void __user *, arg) | ||
194 | { | ||
195 | struct pid_namespace *pid_ns = task_active_pid_ns(current); | ||
196 | char buffer[256]; | ||
197 | int ret = 0; | ||
198 | |||
199 | /* We only trust the superuser with rebooting the system. */ | ||
200 | if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) | ||
201 | return -EPERM; | ||
202 | |||
203 | /* For safety, we require "magic" arguments. */ | ||
204 | if (magic1 != LINUX_REBOOT_MAGIC1 || | ||
205 | (magic2 != LINUX_REBOOT_MAGIC2 && | ||
206 | magic2 != LINUX_REBOOT_MAGIC2A && | ||
207 | magic2 != LINUX_REBOOT_MAGIC2B && | ||
208 | magic2 != LINUX_REBOOT_MAGIC2C)) | ||
209 | return -EINVAL; | ||
210 | |||
211 | /* | ||
212 | * If pid namespaces are enabled and the current task is in a child | ||
213 | * pid_namespace, the command is handled by reboot_pid_ns() which will | ||
214 | * call do_exit(). | ||
215 | */ | ||
216 | ret = reboot_pid_ns(pid_ns, cmd); | ||
217 | if (ret) | ||
218 | return ret; | ||
219 | |||
220 | /* Instead of trying to make the power_off code look like | ||
221 | * halt when pm_power_off is not set, do it the easy way. | ||
222 | */ | ||
223 | if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) | ||
224 | cmd = LINUX_REBOOT_CMD_HALT; | ||
225 | |||
226 | mutex_lock(&reboot_mutex); | ||
227 | switch (cmd) { | ||
228 | case LINUX_REBOOT_CMD_RESTART: | ||
229 | kernel_restart(NULL); | ||
230 | break; | ||
231 | |||
232 | case LINUX_REBOOT_CMD_CAD_ON: | ||
233 | C_A_D = 1; | ||
234 | break; | ||
235 | |||
236 | case LINUX_REBOOT_CMD_CAD_OFF: | ||
237 | C_A_D = 0; | ||
238 | break; | ||
239 | |||
240 | case LINUX_REBOOT_CMD_HALT: | ||
241 | kernel_halt(); | ||
242 | do_exit(0); | ||
243 | panic("cannot halt"); | ||
244 | |||
245 | case LINUX_REBOOT_CMD_POWER_OFF: | ||
246 | kernel_power_off(); | ||
247 | do_exit(0); | ||
248 | break; | ||
249 | |||
250 | case LINUX_REBOOT_CMD_RESTART2: | ||
251 | ret = strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1); | ||
252 | if (ret < 0) { | ||
253 | ret = -EFAULT; | ||
254 | break; | ||
255 | } | ||
256 | buffer[sizeof(buffer) - 1] = '\0'; | ||
257 | |||
258 | kernel_restart(buffer); | ||
259 | break; | ||
260 | |||
261 | #ifdef CONFIG_KEXEC | ||
262 | case LINUX_REBOOT_CMD_KEXEC: | ||
263 | ret = kernel_kexec(); | ||
264 | break; | ||
265 | #endif | ||
266 | |||
267 | #ifdef CONFIG_HIBERNATION | ||
268 | case LINUX_REBOOT_CMD_SW_SUSPEND: | ||
269 | ret = hibernate(); | ||
270 | break; | ||
271 | #endif | ||
272 | |||
273 | default: | ||
274 | ret = -EINVAL; | ||
275 | break; | ||
276 | } | ||
277 | mutex_unlock(&reboot_mutex); | ||
278 | return ret; | ||
279 | } | ||
280 | |||
281 | static void deferred_cad(struct work_struct *dummy) | ||
282 | { | ||
283 | kernel_restart(NULL); | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * This function gets called by ctrl-alt-del - ie the keyboard interrupt. | ||
288 | * As it's called within an interrupt, it may NOT sync: the only choice | ||
289 | * is whether to reboot at once, or just ignore the ctrl-alt-del. | ||
290 | */ | ||
291 | void ctrl_alt_del(void) | ||
292 | { | ||
293 | static DECLARE_WORK(cad_work, deferred_cad); | ||
294 | |||
295 | if (C_A_D) | ||
296 | schedule_work(&cad_work); | ||
297 | else | ||
298 | kill_cad_pid(SIGINT, 1); | ||
299 | } | ||
300 | |||
301 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; | ||
302 | |||
303 | static int __orderly_poweroff(bool force) | ||
304 | { | ||
305 | char **argv; | ||
306 | static char *envp[] = { | ||
307 | "HOME=/", | ||
308 | "PATH=/sbin:/bin:/usr/sbin:/usr/bin", | ||
309 | NULL | ||
310 | }; | ||
311 | int ret; | ||
312 | |||
313 | argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); | ||
314 | if (argv) { | ||
315 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | ||
316 | argv_free(argv); | ||
317 | } else { | ||
318 | ret = -ENOMEM; | ||
319 | } | ||
320 | |||
321 | if (ret && force) { | ||
322 | pr_warn("Failed to start orderly shutdown: forcing the issue\n"); | ||
323 | /* | ||
324 | * I guess this should try to kick off some daemon to sync and | ||
325 | * poweroff asap. Or not even bother syncing if we're doing an | ||
326 | * emergency shutdown? | ||
327 | */ | ||
328 | emergency_sync(); | ||
329 | kernel_power_off(); | ||
330 | } | ||
331 | |||
332 | return ret; | ||
333 | } | ||
334 | |||
335 | static bool poweroff_force; | ||
336 | |||
337 | static void poweroff_work_func(struct work_struct *work) | ||
338 | { | ||
339 | __orderly_poweroff(poweroff_force); | ||
340 | } | ||
341 | |||
342 | static DECLARE_WORK(poweroff_work, poweroff_work_func); | ||
343 | |||
344 | /** | ||
345 | * orderly_poweroff - Trigger an orderly system poweroff | ||
346 | * @force: force poweroff if command execution fails | ||
347 | * | ||
348 | * This may be called from any context to trigger a system shutdown. | ||
349 | * If the orderly shutdown fails, it will force an immediate shutdown. | ||
350 | */ | ||
351 | int orderly_poweroff(bool force) | ||
352 | { | ||
353 | if (force) /* do not override the pending "true" */ | ||
354 | poweroff_force = true; | ||
355 | schedule_work(&poweroff_work); | ||
356 | return 0; | ||
357 | } | ||
358 | EXPORT_SYMBOL_GPL(orderly_poweroff); | ||
359 | |||
360 | static int __init reboot_setup(char *str) | ||
361 | { | ||
362 | for (;;) { | ||
363 | /* | ||
364 | * Having anything passed on the command line via | ||
365 | * reboot= will cause us to disable DMI checking | ||
366 | * below. | ||
367 | */ | ||
368 | reboot_default = 0; | ||
369 | |||
370 | switch (*str) { | ||
371 | case 'w': | ||
372 | reboot_mode = REBOOT_WARM; | ||
373 | break; | ||
374 | |||
375 | case 'c': | ||
376 | reboot_mode = REBOOT_COLD; | ||
377 | break; | ||
378 | |||
379 | case 'h': | ||
380 | reboot_mode = REBOOT_HARD; | ||
381 | break; | ||
382 | |||
383 | case 's': | ||
384 | if (isdigit(*(str+1))) | ||
385 | reboot_cpu = simple_strtoul(str+1, NULL, 0); | ||
386 | else if (str[1] == 'm' && str[2] == 'p' && | ||
387 | isdigit(*(str+3))) | ||
388 | reboot_cpu = simple_strtoul(str+3, NULL, 0); | ||
389 | else | ||
390 | reboot_mode = REBOOT_SOFT; | ||
391 | break; | ||
392 | |||
393 | case 'g': | ||
394 | reboot_mode = REBOOT_GPIO; | ||
395 | break; | ||
396 | |||
397 | case 'b': | ||
398 | case 'a': | ||
399 | case 'k': | ||
400 | case 't': | ||
401 | case 'e': | ||
402 | case 'p': | ||
403 | reboot_type = *str; | ||
404 | break; | ||
405 | |||
406 | case 'f': | ||
407 | reboot_force = 1; | ||
408 | break; | ||
409 | } | ||
410 | |||
411 | str = strchr(str, ','); | ||
412 | if (str) | ||
413 | str++; | ||
414 | else | ||
415 | break; | ||
416 | } | ||
417 | return 1; | ||
418 | } | ||
419 | __setup("reboot=", reboot_setup); | ||
diff --git a/kernel/resource.c b/kernel/resource.c index 77bf11a86c7d..3f285dce9347 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -449,7 +449,6 @@ static int __find_resource(struct resource *root, struct resource *old, | |||
449 | struct resource *this = root->child; | 449 | struct resource *this = root->child; |
450 | struct resource tmp = *new, avail, alloc; | 450 | struct resource tmp = *new, avail, alloc; |
451 | 451 | ||
452 | tmp.flags = new->flags; | ||
453 | tmp.start = root->start; | 452 | tmp.start = root->start; |
454 | /* | 453 | /* |
455 | * Skip past an allocated resource that starts at 0, since the assignment | 454 | * Skip past an allocated resource that starts at 0, since the assignment |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 17d7065c3872..5aef494fc8b4 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -162,6 +162,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) | |||
162 | */ | 162 | */ |
163 | 163 | ||
164 | /** | 164 | /** |
165 | * cputimer_running - return true if cputimer is running | ||
166 | * | ||
167 | * @tsk: Pointer to target task. | ||
168 | */ | ||
169 | static inline bool cputimer_running(struct task_struct *tsk) | ||
170 | |||
171 | { | ||
172 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; | ||
173 | |||
174 | if (!cputimer->running) | ||
175 | return false; | ||
176 | |||
177 | /* | ||
178 | * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime | ||
179 | * in __exit_signal(), we won't account to the signal struct further | ||
180 | * cputime consumed by that task, even though the task can still be | ||
181 | * ticking after __exit_signal(). | ||
182 | * | ||
183 | * In order to keep a consistent behaviour between thread group cputime | ||
184 | * and thread group cputimer accounting, let's also ignore the cputime | ||
185 | * elapsing after __exit_signal() in any thread group timer running. | ||
186 | * | ||
187 | * This makes sure that POSIX CPU clocks and timers are synchronized, so | ||
188 | * that a POSIX CPU timer won't expire while the corresponding POSIX CPU | ||
189 | * clock delta is behind the expiring timer value. | ||
190 | */ | ||
191 | if (unlikely(!tsk->sighand)) | ||
192 | return false; | ||
193 | |||
194 | return true; | ||
195 | } | ||
196 | |||
197 | /** | ||
165 | * account_group_user_time - Maintain utime for a thread group. | 198 | * account_group_user_time - Maintain utime for a thread group. |
166 | * | 199 | * |
167 | * @tsk: Pointer to task structure. | 200 | * @tsk: Pointer to task structure. |
@@ -176,7 +209,7 @@ static inline void account_group_user_time(struct task_struct *tsk, | |||
176 | { | 209 | { |
177 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; | 210 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
178 | 211 | ||
179 | if (!cputimer->running) | 212 | if (!cputimer_running(tsk)) |
180 | return; | 213 | return; |
181 | 214 | ||
182 | raw_spin_lock(&cputimer->lock); | 215 | raw_spin_lock(&cputimer->lock); |
@@ -199,7 +232,7 @@ static inline void account_group_system_time(struct task_struct *tsk, | |||
199 | { | 232 | { |
200 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; | 233 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
201 | 234 | ||
202 | if (!cputimer->running) | 235 | if (!cputimer_running(tsk)) |
203 | return; | 236 | return; |
204 | 237 | ||
205 | raw_spin_lock(&cputimer->lock); | 238 | raw_spin_lock(&cputimer->lock); |
@@ -222,7 +255,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, | |||
222 | { | 255 | { |
223 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; | 256 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
224 | 257 | ||
225 | if (!cputimer->running) | 258 | if (!cputimer_running(tsk)) |
226 | return; | 259 | return; |
227 | 260 | ||
228 | raw_spin_lock(&cputimer->lock); | 261 | raw_spin_lock(&cputimer->lock); |
diff --git a/kernel/signal.c b/kernel/signal.c index 113411bfe8b1..50e41075ac77 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -2848,7 +2848,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | |||
2848 | recalc_sigpending(); | 2848 | recalc_sigpending(); |
2849 | spin_unlock_irq(&tsk->sighand->siglock); | 2849 | spin_unlock_irq(&tsk->sighand->siglock); |
2850 | 2850 | ||
2851 | timeout = schedule_timeout_interruptible(timeout); | 2851 | timeout = freezable_schedule_timeout_interruptible(timeout); |
2852 | 2852 | ||
2853 | spin_lock_irq(&tsk->sighand->siglock); | 2853 | spin_lock_irq(&tsk->sighand->siglock); |
2854 | __set_task_blocked(tsk, &tsk->real_blocked); | 2854 | __set_task_blocked(tsk, &tsk->real_blocked); |
diff --git a/kernel/sys.c b/kernel/sys.c index 2bbd9a73b54c..771129b299f8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -116,20 +116,6 @@ EXPORT_SYMBOL(fs_overflowuid); | |||
116 | EXPORT_SYMBOL(fs_overflowgid); | 116 | EXPORT_SYMBOL(fs_overflowgid); |
117 | 117 | ||
118 | /* | 118 | /* |
119 | * this indicates whether you can reboot with ctrl-alt-del: the default is yes | ||
120 | */ | ||
121 | |||
122 | int C_A_D = 1; | ||
123 | struct pid *cad_pid; | ||
124 | EXPORT_SYMBOL(cad_pid); | ||
125 | |||
126 | /* | ||
127 | * If set, this is used for preparing the system to power off. | ||
128 | */ | ||
129 | |||
130 | void (*pm_power_off_prepare)(void); | ||
131 | |||
132 | /* | ||
133 | * Returns true if current's euid is same as p's uid or euid, | 119 | * Returns true if current's euid is same as p's uid or euid, |
134 | * or has CAP_SYS_NICE to p's user_ns. | 120 | * or has CAP_SYS_NICE to p's user_ns. |
135 | * | 121 | * |
@@ -308,266 +294,6 @@ out_unlock: | |||
308 | return retval; | 294 | return retval; |
309 | } | 295 | } |
310 | 296 | ||
311 | /** | ||
312 | * emergency_restart - reboot the system | ||
313 | * | ||
314 | * Without shutting down any hardware or taking any locks | ||
315 | * reboot the system. This is called when we know we are in | ||
316 | * trouble so this is our best effort to reboot. This is | ||
317 | * safe to call in interrupt context. | ||
318 | */ | ||
319 | void emergency_restart(void) | ||
320 | { | ||
321 | kmsg_dump(KMSG_DUMP_EMERG); | ||
322 | machine_emergency_restart(); | ||
323 | } | ||
324 | EXPORT_SYMBOL_GPL(emergency_restart); | ||
325 | |||
326 | void kernel_restart_prepare(char *cmd) | ||
327 | { | ||
328 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); | ||
329 | system_state = SYSTEM_RESTART; | ||
330 | usermodehelper_disable(); | ||
331 | device_shutdown(); | ||
332 | } | ||
333 | |||
334 | /** | ||
335 | * register_reboot_notifier - Register function to be called at reboot time | ||
336 | * @nb: Info about notifier function to be called | ||
337 | * | ||
338 | * Registers a function with the list of functions | ||
339 | * to be called at reboot time. | ||
340 | * | ||
341 | * Currently always returns zero, as blocking_notifier_chain_register() | ||
342 | * always returns zero. | ||
343 | */ | ||
344 | int register_reboot_notifier(struct notifier_block *nb) | ||
345 | { | ||
346 | return blocking_notifier_chain_register(&reboot_notifier_list, nb); | ||
347 | } | ||
348 | EXPORT_SYMBOL(register_reboot_notifier); | ||
349 | |||
350 | /** | ||
351 | * unregister_reboot_notifier - Unregister previously registered reboot notifier | ||
352 | * @nb: Hook to be unregistered | ||
353 | * | ||
354 | * Unregisters a previously registered reboot | ||
355 | * notifier function. | ||
356 | * | ||
357 | * Returns zero on success, or %-ENOENT on failure. | ||
358 | */ | ||
359 | int unregister_reboot_notifier(struct notifier_block *nb) | ||
360 | { | ||
361 | return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); | ||
362 | } | ||
363 | EXPORT_SYMBOL(unregister_reboot_notifier); | ||
364 | |||
365 | /* Add backwards compatibility for stable trees. */ | ||
366 | #ifndef PF_NO_SETAFFINITY | ||
367 | #define PF_NO_SETAFFINITY PF_THREAD_BOUND | ||
368 | #endif | ||
369 | |||
370 | static void migrate_to_reboot_cpu(void) | ||
371 | { | ||
372 | /* The boot cpu is always logical cpu 0 */ | ||
373 | int cpu = 0; | ||
374 | |||
375 | cpu_hotplug_disable(); | ||
376 | |||
377 | /* Make certain the cpu I'm about to reboot on is online */ | ||
378 | if (!cpu_online(cpu)) | ||
379 | cpu = cpumask_first(cpu_online_mask); | ||
380 | |||
381 | /* Prevent races with other tasks migrating this task */ | ||
382 | current->flags |= PF_NO_SETAFFINITY; | ||
383 | |||
384 | /* Make certain I only run on the appropriate processor */ | ||
385 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
386 | } | ||
387 | |||
388 | /** | ||
389 | * kernel_restart - reboot the system | ||
390 | * @cmd: pointer to buffer containing command to execute for restart | ||
391 | * or %NULL | ||
392 | * | ||
393 | * Shutdown everything and perform a clean reboot. | ||
394 | * This is not safe to call in interrupt context. | ||
395 | */ | ||
396 | void kernel_restart(char *cmd) | ||
397 | { | ||
398 | kernel_restart_prepare(cmd); | ||
399 | migrate_to_reboot_cpu(); | ||
400 | syscore_shutdown(); | ||
401 | if (!cmd) | ||
402 | printk(KERN_EMERG "Restarting system.\n"); | ||
403 | else | ||
404 | printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); | ||
405 | kmsg_dump(KMSG_DUMP_RESTART); | ||
406 | machine_restart(cmd); | ||
407 | } | ||
408 | EXPORT_SYMBOL_GPL(kernel_restart); | ||
409 | |||
410 | static void kernel_shutdown_prepare(enum system_states state) | ||
411 | { | ||
412 | blocking_notifier_call_chain(&reboot_notifier_list, | ||
413 | (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); | ||
414 | system_state = state; | ||
415 | usermodehelper_disable(); | ||
416 | device_shutdown(); | ||
417 | } | ||
418 | /** | ||
419 | * kernel_halt - halt the system | ||
420 | * | ||
421 | * Shutdown everything and perform a clean system halt. | ||
422 | */ | ||
423 | void kernel_halt(void) | ||
424 | { | ||
425 | kernel_shutdown_prepare(SYSTEM_HALT); | ||
426 | migrate_to_reboot_cpu(); | ||
427 | syscore_shutdown(); | ||
428 | printk(KERN_EMERG "System halted.\n"); | ||
429 | kmsg_dump(KMSG_DUMP_HALT); | ||
430 | machine_halt(); | ||
431 | } | ||
432 | |||
433 | EXPORT_SYMBOL_GPL(kernel_halt); | ||
434 | |||
435 | /** | ||
436 | * kernel_power_off - power_off the system | ||
437 | * | ||
438 | * Shutdown everything and perform a clean system power_off. | ||
439 | */ | ||
440 | void kernel_power_off(void) | ||
441 | { | ||
442 | kernel_shutdown_prepare(SYSTEM_POWER_OFF); | ||
443 | if (pm_power_off_prepare) | ||
444 | pm_power_off_prepare(); | ||
445 | migrate_to_reboot_cpu(); | ||
446 | syscore_shutdown(); | ||
447 | printk(KERN_EMERG "Power down.\n"); | ||
448 | kmsg_dump(KMSG_DUMP_POWEROFF); | ||
449 | machine_power_off(); | ||
450 | } | ||
451 | EXPORT_SYMBOL_GPL(kernel_power_off); | ||
452 | |||
453 | static DEFINE_MUTEX(reboot_mutex); | ||
454 | |||
455 | /* | ||
456 | * Reboot system call: for obvious reasons only root may call it, | ||
457 | * and even root needs to set up some magic numbers in the registers | ||
458 | * so that some mistake won't make this reboot the whole machine. | ||
459 | * You can also set the meaning of the ctrl-alt-del-key here. | ||
460 | * | ||
461 | * reboot doesn't sync: do that yourself before calling this. | ||
462 | */ | ||
463 | SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, | ||
464 | void __user *, arg) | ||
465 | { | ||
466 | struct pid_namespace *pid_ns = task_active_pid_ns(current); | ||
467 | char buffer[256]; | ||
468 | int ret = 0; | ||
469 | |||
470 | /* We only trust the superuser with rebooting the system. */ | ||
471 | if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) | ||
472 | return -EPERM; | ||
473 | |||
474 | /* For safety, we require "magic" arguments. */ | ||
475 | if (magic1 != LINUX_REBOOT_MAGIC1 || | ||
476 | (magic2 != LINUX_REBOOT_MAGIC2 && | ||
477 | magic2 != LINUX_REBOOT_MAGIC2A && | ||
478 | magic2 != LINUX_REBOOT_MAGIC2B && | ||
479 | magic2 != LINUX_REBOOT_MAGIC2C)) | ||
480 | return -EINVAL; | ||
481 | |||
482 | /* | ||
483 | * If pid namespaces are enabled and the current task is in a child | ||
484 | * pid_namespace, the command is handled by reboot_pid_ns() which will | ||
485 | * call do_exit(). | ||
486 | */ | ||
487 | ret = reboot_pid_ns(pid_ns, cmd); | ||
488 | if (ret) | ||
489 | return ret; | ||
490 | |||
491 | /* Instead of trying to make the power_off code look like | ||
492 | * halt when pm_power_off is not set do it the easy way. | ||
493 | */ | ||
494 | if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) | ||
495 | cmd = LINUX_REBOOT_CMD_HALT; | ||
496 | |||
497 | mutex_lock(&reboot_mutex); | ||
498 | switch (cmd) { | ||
499 | case LINUX_REBOOT_CMD_RESTART: | ||
500 | kernel_restart(NULL); | ||
501 | break; | ||
502 | |||
503 | case LINUX_REBOOT_CMD_CAD_ON: | ||
504 | C_A_D = 1; | ||
505 | break; | ||
506 | |||
507 | case LINUX_REBOOT_CMD_CAD_OFF: | ||
508 | C_A_D = 0; | ||
509 | break; | ||
510 | |||
511 | case LINUX_REBOOT_CMD_HALT: | ||
512 | kernel_halt(); | ||
513 | do_exit(0); | ||
514 | panic("cannot halt"); | ||
515 | |||
516 | case LINUX_REBOOT_CMD_POWER_OFF: | ||
517 | kernel_power_off(); | ||
518 | do_exit(0); | ||
519 | break; | ||
520 | |||
521 | case LINUX_REBOOT_CMD_RESTART2: | ||
522 | if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { | ||
523 | ret = -EFAULT; | ||
524 | break; | ||
525 | } | ||
526 | buffer[sizeof(buffer) - 1] = '\0'; | ||
527 | |||
528 | kernel_restart(buffer); | ||
529 | break; | ||
530 | |||
531 | #ifdef CONFIG_KEXEC | ||
532 | case LINUX_REBOOT_CMD_KEXEC: | ||
533 | ret = kernel_kexec(); | ||
534 | break; | ||
535 | #endif | ||
536 | |||
537 | #ifdef CONFIG_HIBERNATION | ||
538 | case LINUX_REBOOT_CMD_SW_SUSPEND: | ||
539 | ret = hibernate(); | ||
540 | break; | ||
541 | #endif | ||
542 | |||
543 | default: | ||
544 | ret = -EINVAL; | ||
545 | break; | ||
546 | } | ||
547 | mutex_unlock(&reboot_mutex); | ||
548 | return ret; | ||
549 | } | ||
550 | |||
551 | static void deferred_cad(struct work_struct *dummy) | ||
552 | { | ||
553 | kernel_restart(NULL); | ||
554 | } | ||
555 | |||
556 | /* | ||
557 | * This function gets called by ctrl-alt-del - ie the keyboard interrupt. | ||
558 | * As it's called within an interrupt, it may NOT sync: the only choice | ||
559 | * is whether to reboot at once, or just ignore the ctrl-alt-del. | ||
560 | */ | ||
561 | void ctrl_alt_del(void) | ||
562 | { | ||
563 | static DECLARE_WORK(cad_work, deferred_cad); | ||
564 | |||
565 | if (C_A_D) | ||
566 | schedule_work(&cad_work); | ||
567 | else | ||
568 | kill_cad_pid(SIGINT, 1); | ||
569 | } | ||
570 | |||
571 | /* | 297 | /* |
572 | * Unprivileged users may change the real gid to the effective gid | 298 | * Unprivileged users may change the real gid to the effective gid |
573 | * or vice versa. (BSD-style) | 299 | * or vice versa. (BSD-style) |
@@ -1309,6 +1035,17 @@ out: | |||
1309 | return retval; | 1035 | return retval; |
1310 | } | 1036 | } |
1311 | 1037 | ||
1038 | static void set_special_pids(struct pid *pid) | ||
1039 | { | ||
1040 | struct task_struct *curr = current->group_leader; | ||
1041 | |||
1042 | if (task_session(curr) != pid) | ||
1043 | change_pid(curr, PIDTYPE_SID, pid); | ||
1044 | |||
1045 | if (task_pgrp(curr) != pid) | ||
1046 | change_pid(curr, PIDTYPE_PGID, pid); | ||
1047 | } | ||
1048 | |||
1312 | SYSCALL_DEFINE0(setsid) | 1049 | SYSCALL_DEFINE0(setsid) |
1313 | { | 1050 | { |
1314 | struct task_struct *group_leader = current->group_leader; | 1051 | struct task_struct *group_leader = current->group_leader; |
@@ -1328,7 +1065,7 @@ SYSCALL_DEFINE0(setsid) | |||
1328 | goto out; | 1065 | goto out; |
1329 | 1066 | ||
1330 | group_leader->signal->leader = 1; | 1067 | group_leader->signal->leader = 1; |
1331 | __set_special_pids(sid); | 1068 | set_special_pids(sid); |
1332 | 1069 | ||
1333 | proc_clear_tty(group_leader); | 1070 | proc_clear_tty(group_leader); |
1334 | 1071 | ||
@@ -2281,68 +2018,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, | |||
2281 | return err ? -EFAULT : 0; | 2018 | return err ? -EFAULT : 0; |
2282 | } | 2019 | } |
2283 | 2020 | ||
2284 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; | ||
2285 | |||
2286 | static int __orderly_poweroff(bool force) | ||
2287 | { | ||
2288 | char **argv; | ||
2289 | static char *envp[] = { | ||
2290 | "HOME=/", | ||
2291 | "PATH=/sbin:/bin:/usr/sbin:/usr/bin", | ||
2292 | NULL | ||
2293 | }; | ||
2294 | int ret; | ||
2295 | |||
2296 | argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); | ||
2297 | if (argv) { | ||
2298 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | ||
2299 | argv_free(argv); | ||
2300 | } else { | ||
2301 | printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", | ||
2302 | __func__, poweroff_cmd); | ||
2303 | ret = -ENOMEM; | ||
2304 | } | ||
2305 | |||
2306 | if (ret && force) { | ||
2307 | printk(KERN_WARNING "Failed to start orderly shutdown: " | ||
2308 | "forcing the issue\n"); | ||
2309 | /* | ||
2310 | * I guess this should try to kick off some daemon to sync and | ||
2311 | * poweroff asap. Or not even bother syncing if we're doing an | ||
2312 | * emergency shutdown? | ||
2313 | */ | ||
2314 | emergency_sync(); | ||
2315 | kernel_power_off(); | ||
2316 | } | ||
2317 | |||
2318 | return ret; | ||
2319 | } | ||
2320 | |||
2321 | static bool poweroff_force; | ||
2322 | |||
2323 | static void poweroff_work_func(struct work_struct *work) | ||
2324 | { | ||
2325 | __orderly_poweroff(poweroff_force); | ||
2326 | } | ||
2327 | |||
2328 | static DECLARE_WORK(poweroff_work, poweroff_work_func); | ||
2329 | |||
2330 | /** | ||
2331 | * orderly_poweroff - Trigger an orderly system poweroff | ||
2332 | * @force: force poweroff if command execution fails | ||
2333 | * | ||
2334 | * This may be called from any context to trigger a system shutdown. | ||
2335 | * If the orderly shutdown fails, it will force an immediate shutdown. | ||
2336 | */ | ||
2337 | int orderly_poweroff(bool force) | ||
2338 | { | ||
2339 | if (force) /* do not override the pending "true" */ | ||
2340 | poweroff_force = true; | ||
2341 | schedule_work(&poweroff_work); | ||
2342 | return 0; | ||
2343 | } | ||
2344 | EXPORT_SYMBOL_GPL(orderly_poweroff); | ||
2345 | |||
2346 | /** | 2021 | /** |
2347 | * do_sysinfo - fill in sysinfo struct | 2022 | * do_sysinfo - fill in sysinfo struct |
2348 | * @info: pointer to buffer to fill | 2023 | * @info: pointer to buffer to fill |
@@ -2355,8 +2030,7 @@ static int do_sysinfo(struct sysinfo *info) | |||
2355 | 2030 | ||
2356 | memset(info, 0, sizeof(struct sysinfo)); | 2031 | memset(info, 0, sizeof(struct sysinfo)); |
2357 | 2032 | ||
2358 | ktime_get_ts(&tp); | 2033 | get_monotonic_boottime(&tp); |
2359 | monotonic_to_bootbased(&tp); | ||
2360 | info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); | 2034 | info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); |
2361 | 2035 | ||
2362 | get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); | 2036 | get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5c9e33b5c0eb..ac09d98490aa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -599,6 +599,13 @@ static struct ctl_table kern_table[] = { | |||
599 | .mode = 0644, | 599 | .mode = 0644, |
600 | .proc_handler = proc_dointvec, | 600 | .proc_handler = proc_dointvec, |
601 | }, | 601 | }, |
602 | { | ||
603 | .procname = "traceoff_on_warning", | ||
604 | .data = &__disable_trace_on_warning, | ||
605 | .maxlen = sizeof(__disable_trace_on_warning), | ||
606 | .mode = 0644, | ||
607 | .proc_handler = proc_dointvec, | ||
608 | }, | ||
602 | #endif | 609 | #endif |
603 | #ifdef CONFIG_MODULES | 610 | #ifdef CONFIG_MODULES |
604 | { | 611 | { |
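The new traceoff_on_warning entry lands in the kernel sysctl table, so when the corresponding tracing support is built in it should surface as /proc/sys/kernel/traceoff_on_warning (mode 0644, a plain integer handled by proc_dointvec). A minimal, hypothetical userspace sketch for switching it on, assuming that path:

/* Hypothetical helper: enable the traceoff_on_warning knob added above. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/kernel/traceoff_on_warning", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1", 1) != 1)
		perror("write");
	close(fd);
	return 0;
}

Equivalently, sysctl kernel.traceoff_on_warning=1 should set the same value through the usual /proc/sys mapping.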
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index aea4a9ea6fc8..b609213ca9a2 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -3,7 +3,6 @@ | |||
3 | #include "../fs/xfs/xfs_sysctl.h" | 3 | #include "../fs/xfs/xfs_sysctl.h" |
4 | #include <linux/sunrpc/debug.h> | 4 | #include <linux/sunrpc/debug.h> |
5 | #include <linux/string.h> | 5 | #include <linux/string.h> |
6 | #include <net/ip_vs.h> | ||
7 | #include <linux/syscalls.h> | 6 | #include <linux/syscalls.h> |
8 | #include <linux/namei.h> | 7 | #include <linux/namei.h> |
9 | #include <linux/mount.h> | 8 | #include <linux/mount.h> |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ff7d9d2ab504..9250130646f5 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -4,6 +4,8 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o | |||
4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o |
5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o |
6 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o | 6 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o |
7 | obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o | ||
7 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o | 8 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o |
8 | obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o | 9 | obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o |
9 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o | 10 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o |
11 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o | ||
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index f11d83b12949..eec50fcef9e4 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -199,6 +199,13 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) | |||
199 | 199 | ||
200 | } | 200 | } |
201 | 201 | ||
202 | ktime_t alarm_expires_remaining(const struct alarm *alarm) | ||
203 | { | ||
204 | struct alarm_base *base = &alarm_bases[alarm->type]; | ||
205 | return ktime_sub(alarm->node.expires, base->gettime()); | ||
206 | } | ||
207 | EXPORT_SYMBOL_GPL(alarm_expires_remaining); | ||
208 | |||
202 | #ifdef CONFIG_RTC_CLASS | 209 | #ifdef CONFIG_RTC_CLASS |
203 | /** | 210 | /** |
204 | * alarmtimer_suspend - Suspend time callback | 211 | * alarmtimer_suspend - Suspend time callback |
@@ -303,9 +310,10 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, | |||
303 | alarm->type = type; | 310 | alarm->type = type; |
304 | alarm->state = ALARMTIMER_STATE_INACTIVE; | 311 | alarm->state = ALARMTIMER_STATE_INACTIVE; |
305 | } | 312 | } |
313 | EXPORT_SYMBOL_GPL(alarm_init); | ||
306 | 314 | ||
307 | /** | 315 | /** |
308 | * alarm_start - Sets an alarm to fire | 316 | * alarm_start - Sets an absolute alarm to fire |
309 | * @alarm: ptr to alarm to set | 317 | * @alarm: ptr to alarm to set |
310 | * @start: time to run the alarm | 318 | * @start: time to run the alarm |
311 | */ | 319 | */ |
@@ -323,6 +331,34 @@ int alarm_start(struct alarm *alarm, ktime_t start) | |||
323 | spin_unlock_irqrestore(&base->lock, flags); | 331 | spin_unlock_irqrestore(&base->lock, flags); |
324 | return ret; | 332 | return ret; |
325 | } | 333 | } |
334 | EXPORT_SYMBOL_GPL(alarm_start); | ||
335 | |||
336 | /** | ||
337 | * alarm_start_relative - Sets a relative alarm to fire | ||
338 | * @alarm: ptr to alarm to set | ||
339 | * @start: time relative to now to run the alarm | ||
340 | */ | ||
341 | int alarm_start_relative(struct alarm *alarm, ktime_t start) | ||
342 | { | ||
343 | struct alarm_base *base = &alarm_bases[alarm->type]; | ||
344 | |||
345 | start = ktime_add(start, base->gettime()); | ||
346 | return alarm_start(alarm, start); | ||
347 | } | ||
348 | EXPORT_SYMBOL_GPL(alarm_start_relative); | ||
349 | |||
350 | void alarm_restart(struct alarm *alarm) | ||
351 | { | ||
352 | struct alarm_base *base = &alarm_bases[alarm->type]; | ||
353 | unsigned long flags; | ||
354 | |||
355 | spin_lock_irqsave(&base->lock, flags); | ||
356 | hrtimer_set_expires(&alarm->timer, alarm->node.expires); | ||
357 | hrtimer_restart(&alarm->timer); | ||
358 | alarmtimer_enqueue(base, alarm); | ||
359 | spin_unlock_irqrestore(&base->lock, flags); | ||
360 | } | ||
361 | EXPORT_SYMBOL_GPL(alarm_restart); | ||
326 | 362 | ||
327 | /** | 363 | /** |
328 | * alarm_try_to_cancel - Tries to cancel an alarm timer | 364 | * alarm_try_to_cancel - Tries to cancel an alarm timer |
@@ -344,6 +380,7 @@ int alarm_try_to_cancel(struct alarm *alarm) | |||
344 | spin_unlock_irqrestore(&base->lock, flags); | 380 | spin_unlock_irqrestore(&base->lock, flags); |
345 | return ret; | 381 | return ret; |
346 | } | 382 | } |
383 | EXPORT_SYMBOL_GPL(alarm_try_to_cancel); | ||
347 | 384 | ||
348 | 385 | ||
349 | /** | 386 | /** |
@@ -361,6 +398,7 @@ int alarm_cancel(struct alarm *alarm) | |||
361 | cpu_relax(); | 398 | cpu_relax(); |
362 | } | 399 | } |
363 | } | 400 | } |
401 | EXPORT_SYMBOL_GPL(alarm_cancel); | ||
364 | 402 | ||
365 | 403 | ||
366 | u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) | 404 | u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) |
@@ -393,8 +431,15 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) | |||
393 | alarm->node.expires = ktime_add(alarm->node.expires, interval); | 431 | alarm->node.expires = ktime_add(alarm->node.expires, interval); |
394 | return overrun; | 432 | return overrun; |
395 | } | 433 | } |
434 | EXPORT_SYMBOL_GPL(alarm_forward); | ||
396 | 435 | ||
436 | u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) | ||
437 | { | ||
438 | struct alarm_base *base = &alarm_bases[alarm->type]; | ||
397 | 439 | ||
440 | return alarm_forward(alarm, base->gettime(), interval); | ||
441 | } | ||
442 | EXPORT_SYMBOL_GPL(alarm_forward_now); | ||
398 | 443 | ||
399 | 444 | ||
400 | /** | 445 | /** |
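This hunk turns the alarmtimer core into an in-kernel API by exporting alarm_init(), alarm_start()/alarm_start_relative(), alarm_restart(), alarm_try_to_cancel()/alarm_cancel(), alarm_forward()/alarm_forward_now() and alarm_expires_remaining(). A minimal, hypothetical consumer of those exports (all example_* names are invented, not part of the patch) might look like this:

/*
 * Illustrative sketch only: a module using the newly exported alarm
 * interfaces. All "example_*" identifiers are hypothetical.
 */
#include <linux/alarmtimer.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/module.h>

static struct alarm example_alarm;

static enum alarmtimer_restart example_alarm_fn(struct alarm *alarm,
						ktime_t now)
{
	pr_info("example: alarm fired\n");
	return ALARMTIMER_NORESTART;
}

static int __init example_init(void)
{
	/* Fire once, 30 seconds from now, tracked against boottime. */
	alarm_init(&example_alarm, ALARM_BOOTTIME, example_alarm_fn);
	alarm_start_relative(&example_alarm, ktime_set(30, 0));
	return 0;
}

static void __exit example_exit(void)
{
	alarm_cancel(&example_alarm);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");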
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index c6d6400ee137..38959c866789 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -15,20 +15,23 @@ | |||
15 | #include <linux/hrtimer.h> | 15 | #include <linux/hrtimer.h> |
16 | #include <linux/init.h> | 16 | #include <linux/init.h> |
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/notifier.h> | ||
19 | #include <linux/smp.h> | 18 | #include <linux/smp.h> |
19 | #include <linux/device.h> | ||
20 | 20 | ||
21 | #include "tick-internal.h" | 21 | #include "tick-internal.h" |
22 | 22 | ||
23 | /* The registered clock event devices */ | 23 | /* The registered clock event devices */ |
24 | static LIST_HEAD(clockevent_devices); | 24 | static LIST_HEAD(clockevent_devices); |
25 | static LIST_HEAD(clockevents_released); | 25 | static LIST_HEAD(clockevents_released); |
26 | |||
27 | /* Notification for clock events */ | ||
28 | static RAW_NOTIFIER_HEAD(clockevents_chain); | ||
29 | |||
30 | /* Protection for the above */ | 26 | /* Protection for the above */ |
31 | static DEFINE_RAW_SPINLOCK(clockevents_lock); | 27 | static DEFINE_RAW_SPINLOCK(clockevents_lock); |
28 | /* Protection for unbind operations */ | ||
29 | static DEFINE_MUTEX(clockevents_mutex); | ||
30 | |||
31 | struct ce_unbind { | ||
32 | struct clock_event_device *ce; | ||
33 | int res; | ||
34 | }; | ||
32 | 35 | ||
33 | /** | 36 | /** |
34 | * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds | 37 | * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds |
@@ -232,47 +235,107 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, | |||
232 | return (rc && force) ? clockevents_program_min_delta(dev) : rc; | 235 | return (rc && force) ? clockevents_program_min_delta(dev) : rc; |
233 | } | 236 | } |
234 | 237 | ||
235 | /** | 238 | /* |
236 | * clockevents_register_notifier - register a clock events change listener | 239 | * Called after a notify add to make devices available which were |
240 | * released from the notifier call. | ||
237 | */ | 241 | */ |
238 | int clockevents_register_notifier(struct notifier_block *nb) | 242 | static void clockevents_notify_released(void) |
239 | { | 243 | { |
240 | unsigned long flags; | 244 | struct clock_event_device *dev; |
241 | int ret; | ||
242 | 245 | ||
243 | raw_spin_lock_irqsave(&clockevents_lock, flags); | 246 | while (!list_empty(&clockevents_released)) { |
244 | ret = raw_notifier_chain_register(&clockevents_chain, nb); | 247 | dev = list_entry(clockevents_released.next, |
245 | raw_spin_unlock_irqrestore(&clockevents_lock, flags); | 248 | struct clock_event_device, list); |
249 | list_del(&dev->list); | ||
250 | list_add(&dev->list, &clockevent_devices); | ||
251 | tick_check_new_device(dev); | ||
252 | } | ||
253 | } | ||
246 | 254 | ||
247 | return ret; | 255 | /* |
256 | * Try to install a replacement clock event device | ||
257 | */ | ||
258 | static int clockevents_replace(struct clock_event_device *ced) | ||
259 | { | ||
260 | struct clock_event_device *dev, *newdev = NULL; | ||
261 | |||
262 | list_for_each_entry(dev, &clockevent_devices, list) { | ||
263 | if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) | ||
264 | continue; | ||
265 | |||
266 | if (!tick_check_replacement(newdev, dev)) | ||
267 | continue; | ||
268 | |||
269 | if (!try_module_get(dev->owner)) | ||
270 | continue; | ||
271 | |||
272 | if (newdev) | ||
273 | module_put(newdev->owner); | ||
274 | newdev = dev; | ||
275 | } | ||
276 | if (newdev) { | ||
277 | tick_install_replacement(newdev); | ||
278 | list_del_init(&ced->list); | ||
279 | } | ||
280 | return newdev ? 0 : -EBUSY; | ||
248 | } | 281 | } |
249 | 282 | ||
250 | /* | 283 | /* |
251 | * Notify about a clock event change. Called with clockevents_lock | 284 | * Called with clockevents_mutex and clockevents_lock held |
252 | * held. | ||
253 | */ | 285 | */ |
254 | static void clockevents_do_notify(unsigned long reason, void *dev) | 286 | static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) |
255 | { | 287 | { |
256 | raw_notifier_call_chain(&clockevents_chain, reason, dev); | 288 | /* Fast track. Device is unused */ |
289 | if (ced->mode == CLOCK_EVT_MODE_UNUSED) { | ||
290 | list_del_init(&ced->list); | ||
291 | return 0; | ||
292 | } | ||
293 | |||
294 | return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; | ||
257 | } | 295 | } |
258 | 296 | ||
259 | /* | 297 | /* |
260 | * Called after a notify add to make devices available which were | 298 | * SMP function call to unbind a device |
261 | * released from the notifier call. | ||
262 | */ | 299 | */ |
263 | static void clockevents_notify_released(void) | 300 | static void __clockevents_unbind(void *arg) |
264 | { | 301 | { |
265 | struct clock_event_device *dev; | 302 | struct ce_unbind *cu = arg; |
303 | int res; | ||
304 | |||
305 | raw_spin_lock(&clockevents_lock); | ||
306 | res = __clockevents_try_unbind(cu->ce, smp_processor_id()); | ||
307 | if (res == -EAGAIN) | ||
308 | res = clockevents_replace(cu->ce); | ||
309 | cu->res = res; | ||
310 | raw_spin_unlock(&clockevents_lock); | ||
311 | } | ||
266 | 312 | ||
267 | while (!list_empty(&clockevents_released)) { | 313 | /* |
268 | dev = list_entry(clockevents_released.next, | 314 | * Issues smp function call to unbind a per cpu device. Called with |
269 | struct clock_event_device, list); | 315 | * clockevents_mutex held. |
270 | list_del(&dev->list); | 316 | */ |
271 | list_add(&dev->list, &clockevent_devices); | 317 | static int clockevents_unbind(struct clock_event_device *ced, int cpu) |
272 | clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); | 318 | { |
273 | } | 319 | struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; |
320 | |||
321 | smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); | ||
322 | return cu.res; | ||
274 | } | 323 | } |
275 | 324 | ||
325 | /* | ||
326 | * Unbind a clockevents device. | ||
327 | */ | ||
328 | int clockevents_unbind_device(struct clock_event_device *ced, int cpu) | ||
329 | { | ||
330 | int ret; | ||
331 | |||
332 | mutex_lock(&clockevents_mutex); | ||
333 | ret = clockevents_unbind(ced, cpu); | ||
334 | mutex_unlock(&clockevents_mutex); | ||
335 | return ret; | ||
336 | } | ||
337 | EXPORT_SYMBOL_GPL(clockevents_unbind_device); | ||
338 | |||
276 | /** | 339 | /** |
277 | * clockevents_register_device - register a clock event device | 340 | * clockevents_register_device - register a clock event device |
278 | * @dev: device to register | 341 | * @dev: device to register |
@@ -290,7 +353,7 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
290 | raw_spin_lock_irqsave(&clockevents_lock, flags); | 353 | raw_spin_lock_irqsave(&clockevents_lock, flags); |
291 | 354 | ||
292 | list_add(&dev->list, &clockevent_devices); | 355 | list_add(&dev->list, &clockevent_devices); |
293 | clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); | 356 | tick_check_new_device(dev); |
294 | clockevents_notify_released(); | 357 | clockevents_notify_released(); |
295 | 358 | ||
296 | raw_spin_unlock_irqrestore(&clockevents_lock, flags); | 359 | raw_spin_unlock_irqrestore(&clockevents_lock, flags); |
@@ -386,6 +449,7 @@ void clockevents_exchange_device(struct clock_event_device *old, | |||
386 | * released list and do a notify add later. | 449 | * released list and do a notify add later. |
387 | */ | 450 | */ |
388 | if (old) { | 451 | if (old) { |
452 | module_put(old->owner); | ||
389 | clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); | 453 | clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); |
390 | list_del(&old->list); | 454 | list_del(&old->list); |
391 | list_add(&old->list, &clockevents_released); | 455 | list_add(&old->list, &clockevents_released); |
@@ -433,10 +497,36 @@ void clockevents_notify(unsigned long reason, void *arg) | |||
433 | int cpu; | 497 | int cpu; |
434 | 498 | ||
435 | raw_spin_lock_irqsave(&clockevents_lock, flags); | 499 | raw_spin_lock_irqsave(&clockevents_lock, flags); |
436 | clockevents_do_notify(reason, arg); | ||
437 | 500 | ||
438 | switch (reason) { | 501 | switch (reason) { |
502 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | ||
503 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | ||
504 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | ||
505 | tick_broadcast_on_off(reason, arg); | ||
506 | break; | ||
507 | |||
508 | case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: | ||
509 | case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: | ||
510 | tick_broadcast_oneshot_control(reason); | ||
511 | break; | ||
512 | |||
513 | case CLOCK_EVT_NOTIFY_CPU_DYING: | ||
514 | tick_handover_do_timer(arg); | ||
515 | break; | ||
516 | |||
517 | case CLOCK_EVT_NOTIFY_SUSPEND: | ||
518 | tick_suspend(); | ||
519 | tick_suspend_broadcast(); | ||
520 | break; | ||
521 | |||
522 | case CLOCK_EVT_NOTIFY_RESUME: | ||
523 | tick_resume(); | ||
524 | break; | ||
525 | |||
439 | case CLOCK_EVT_NOTIFY_CPU_DEAD: | 526 | case CLOCK_EVT_NOTIFY_CPU_DEAD: |
527 | tick_shutdown_broadcast_oneshot(arg); | ||
528 | tick_shutdown_broadcast(arg); | ||
529 | tick_shutdown(arg); | ||
440 | /* | 530 | /* |
441 | * Unregister the clock event devices which were | 531 | * Unregister the clock event devices which were |
442 | * released from the users in the notify chain. | 532 | * released from the users in the notify chain. |
@@ -462,4 +552,123 @@ void clockevents_notify(unsigned long reason, void *arg) | |||
462 | raw_spin_unlock_irqrestore(&clockevents_lock, flags); | 552 | raw_spin_unlock_irqrestore(&clockevents_lock, flags); |
463 | } | 553 | } |
464 | EXPORT_SYMBOL_GPL(clockevents_notify); | 554 | EXPORT_SYMBOL_GPL(clockevents_notify); |
555 | |||
556 | #ifdef CONFIG_SYSFS | ||
557 | struct bus_type clockevents_subsys = { | ||
558 | .name = "clockevents", | ||
559 | .dev_name = "clockevent", | ||
560 | }; | ||
561 | |||
562 | static DEFINE_PER_CPU(struct device, tick_percpu_dev); | ||
563 | static struct tick_device *tick_get_tick_dev(struct device *dev); | ||
564 | |||
565 | static ssize_t sysfs_show_current_tick_dev(struct device *dev, | ||
566 | struct device_attribute *attr, | ||
567 | char *buf) | ||
568 | { | ||
569 | struct tick_device *td; | ||
570 | ssize_t count = 0; | ||
571 | |||
572 | raw_spin_lock_irq(&clockevents_lock); | ||
573 | td = tick_get_tick_dev(dev); | ||
574 | if (td && td->evtdev) | ||
575 | count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); | ||
576 | raw_spin_unlock_irq(&clockevents_lock); | ||
577 | return count; | ||
578 | } | ||
579 | static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); | ||
580 | |||
581 | /* We don't support the abomination of removable broadcast devices */ | ||
582 | static ssize_t sysfs_unbind_tick_dev(struct device *dev, | ||
583 | struct device_attribute *attr, | ||
584 | const char *buf, size_t count) | ||
585 | { | ||
586 | char name[CS_NAME_LEN]; | ||
587 | ssize_t ret = sysfs_get_uname(buf, name, count); | ||
588 | struct clock_event_device *ce; | ||
589 | |||
590 | if (ret < 0) | ||
591 | return ret; | ||
592 | |||
593 | ret = -ENODEV; | ||
594 | mutex_lock(&clockevents_mutex); | ||
595 | raw_spin_lock_irq(&clockevents_lock); | ||
596 | list_for_each_entry(ce, &clockevent_devices, list) { | ||
597 | if (!strcmp(ce->name, name)) { | ||
598 | ret = __clockevents_try_unbind(ce, dev->id); | ||
599 | break; | ||
600 | } | ||
601 | } | ||
602 | raw_spin_unlock_irq(&clockevents_lock); | ||
603 | /* | ||
604 | * We hold clockevents_mutex, so ce can't go away | ||
605 | */ | ||
606 | if (ret == -EAGAIN) | ||
607 | ret = clockevents_unbind(ce, dev->id); | ||
608 | mutex_unlock(&clockevents_mutex); | ||
609 | return ret ? ret : count; | ||
610 | } | ||
611 | static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev); | ||
612 | |||
613 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | ||
614 | static struct device tick_bc_dev = { | ||
615 | .init_name = "broadcast", | ||
616 | .id = 0, | ||
617 | .bus = &clockevents_subsys, | ||
618 | }; | ||
619 | |||
620 | static struct tick_device *tick_get_tick_dev(struct device *dev) | ||
621 | { | ||
622 | return dev == &tick_bc_dev ? tick_get_broadcast_device() : | ||
623 | &per_cpu(tick_cpu_device, dev->id); | ||
624 | } | ||
625 | |||
626 | static __init int tick_broadcast_init_sysfs(void) | ||
627 | { | ||
628 | int err = device_register(&tick_bc_dev); | ||
629 | |||
630 | if (!err) | ||
631 | err = device_create_file(&tick_bc_dev, &dev_attr_current_device); | ||
632 | return err; | ||
633 | } | ||
634 | #else | ||
635 | static struct tick_device *tick_get_tick_dev(struct device *dev) | ||
636 | { | ||
637 | return &per_cpu(tick_cpu_device, dev->id); | ||
638 | } | ||
639 | static inline int tick_broadcast_init_sysfs(void) { return 0; } | ||
465 | #endif | 640 | #endif |
641 | |||
642 | static int __init tick_init_sysfs(void) | ||
643 | { | ||
644 | int cpu; | ||
645 | |||
646 | for_each_possible_cpu(cpu) { | ||
647 | struct device *dev = &per_cpu(tick_percpu_dev, cpu); | ||
648 | int err; | ||
649 | |||
650 | dev->id = cpu; | ||
651 | dev->bus = &clockevents_subsys; | ||
652 | err = device_register(dev); | ||
653 | if (!err) | ||
654 | err = device_create_file(dev, &dev_attr_current_device); | ||
655 | if (!err) | ||
656 | err = device_create_file(dev, &dev_attr_unbind_device); | ||
657 | if (err) | ||
658 | return err; | ||
659 | } | ||
660 | return tick_broadcast_init_sysfs(); | ||
661 | } | ||
662 | |||
663 | static int __init clockevents_init_sysfs(void) | ||
664 | { | ||
665 | int err = subsys_system_register(&clockevents_subsys, NULL); | ||
666 | |||
667 | if (!err) | ||
668 | err = tick_init_sysfs(); | ||
669 | return err; | ||
670 | } | ||
671 | device_initcall(clockevents_init_sysfs); | ||
672 | #endif /* SYSFS */ | ||
673 | |||
674 | #endif /* GENERIC_CLOCK_EVENTS */ | ||
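Beyond converting the notifier chain into direct tick_* calls, the hunk above adds a sysfs interface: each CPU gets a clockevent<N> device with a read-only current_device attribute and a write-only unbind_device attribute that feeds clockevents_unbind(). Assuming the usual /sys/devices/system/<subsys>/<dev_name><id> layout for subsys_system_register(), a hypothetical userspace sketch exercising it could be:

/*
 * Hypothetical sketch: read CPU0's tick device and ask the kernel to
 * unbind it by name. The sysfs paths are an assumption based on the
 * subsystem/device names registered above.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char name[64] = "";
	FILE *f = fopen("/sys/devices/system/clockevents/clockevent0/current_device", "r");

	if (!f || !fgets(name, sizeof(name), f)) {
		perror("current_device");
		return 1;
	}
	fclose(f);
	printf("cpu0 tick device: %s", name);

	/* Writing a device name to unbind_device triggers clockevents_unbind(). */
	f = fopen("/sys/devices/system/clockevents/clockevent0/unbind_device", "w");
	if (!f) {
		perror("unbind_device");
		return 1;
	}
	name[strcspn(name, "\n")] = '\0';
	fputs(name, f);
	fclose(f);
	return 0;
}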
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c9583382141a..50a8736757f3 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -31,6 +31,8 @@ | |||
31 | #include <linux/tick.h> | 31 | #include <linux/tick.h> |
32 | #include <linux/kthread.h> | 32 | #include <linux/kthread.h> |
33 | 33 | ||
34 | #include "tick-internal.h" | ||
35 | |||
34 | void timecounter_init(struct timecounter *tc, | 36 | void timecounter_init(struct timecounter *tc, |
35 | const struct cyclecounter *cc, | 37 | const struct cyclecounter *cc, |
36 | u64 start_tstamp) | 38 | u64 start_tstamp) |
@@ -174,11 +176,12 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) | |||
174 | static struct clocksource *curr_clocksource; | 176 | static struct clocksource *curr_clocksource; |
175 | static LIST_HEAD(clocksource_list); | 177 | static LIST_HEAD(clocksource_list); |
176 | static DEFINE_MUTEX(clocksource_mutex); | 178 | static DEFINE_MUTEX(clocksource_mutex); |
177 | static char override_name[32]; | 179 | static char override_name[CS_NAME_LEN]; |
178 | static int finished_booting; | 180 | static int finished_booting; |
179 | 181 | ||
180 | #ifdef CONFIG_CLOCKSOURCE_WATCHDOG | 182 | #ifdef CONFIG_CLOCKSOURCE_WATCHDOG |
181 | static void clocksource_watchdog_work(struct work_struct *work); | 183 | static void clocksource_watchdog_work(struct work_struct *work); |
184 | static void clocksource_select(void); | ||
182 | 185 | ||
183 | static LIST_HEAD(watchdog_list); | 186 | static LIST_HEAD(watchdog_list); |
184 | static struct clocksource *watchdog; | 187 | static struct clocksource *watchdog; |
@@ -299,13 +302,30 @@ static void clocksource_watchdog(unsigned long data) | |||
299 | if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && | 302 | if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && |
300 | (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && | 303 | (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && |
301 | (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { | 304 | (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { |
305 | /* Mark it valid for high-res. */ | ||
302 | cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; | 306 | cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; |
307 | |||
308 | /* | ||
309 | * clocksource_done_booting() will sort it if | ||
310 | * finished_booting is not set yet. | ||
311 | */ | ||
312 | if (!finished_booting) | ||
313 | continue; | ||
314 | |||
303 | /* | 315 | /* |
304 | * We just marked the clocksource as highres-capable, | 316 | * If this is not the current clocksource let |
305 | * notify the rest of the system as well so that we | 317 | * the watchdog thread reselect it. Due to the |
306 | * transition into high-res mode: | 318 | * change to high res this clocksource might |
319 | * be preferred now. If it is the current | ||
320 | * clocksource let the tick code know about | ||
321 | * that change. | ||
307 | */ | 322 | */ |
308 | tick_clock_notify(); | 323 | if (cs != curr_clocksource) { |
324 | cs->flags |= CLOCK_SOURCE_RESELECT; | ||
325 | schedule_work(&watchdog_work); | ||
326 | } else { | ||
327 | tick_clock_notify(); | ||
328 | } | ||
309 | } | 329 | } |
310 | } | 330 | } |
311 | 331 | ||
@@ -388,44 +408,39 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) | |||
388 | 408 | ||
389 | static void clocksource_dequeue_watchdog(struct clocksource *cs) | 409 | static void clocksource_dequeue_watchdog(struct clocksource *cs) |
390 | { | 410 | { |
391 | struct clocksource *tmp; | ||
392 | unsigned long flags; | 411 | unsigned long flags; |
393 | 412 | ||
394 | spin_lock_irqsave(&watchdog_lock, flags); | 413 | spin_lock_irqsave(&watchdog_lock, flags); |
395 | if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { | 414 | if (cs != watchdog) { |
396 | /* cs is a watched clocksource. */ | 415 | if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { |
397 | list_del_init(&cs->wd_list); | 416 | /* cs is a watched clocksource. */ |
398 | } else if (cs == watchdog) { | 417 | list_del_init(&cs->wd_list); |
399 | /* Reset watchdog cycles */ | 418 | /* Check if the watchdog timer needs to be stopped. */ |
400 | clocksource_reset_watchdog(); | 419 | clocksource_stop_watchdog(); |
401 | /* Current watchdog is removed. Find an alternative. */ | ||
402 | watchdog = NULL; | ||
403 | list_for_each_entry(tmp, &clocksource_list, list) { | ||
404 | if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) | ||
405 | continue; | ||
406 | if (!watchdog || tmp->rating > watchdog->rating) | ||
407 | watchdog = tmp; | ||
408 | } | 420 | } |
409 | } | 421 | } |
410 | cs->flags &= ~CLOCK_SOURCE_WATCHDOG; | ||
411 | /* Check if the watchdog timer needs to be stopped. */ | ||
412 | clocksource_stop_watchdog(); | ||
413 | spin_unlock_irqrestore(&watchdog_lock, flags); | 422 | spin_unlock_irqrestore(&watchdog_lock, flags); |
414 | } | 423 | } |
415 | 424 | ||
416 | static int clocksource_watchdog_kthread(void *data) | 425 | static int __clocksource_watchdog_kthread(void) |
417 | { | 426 | { |
418 | struct clocksource *cs, *tmp; | 427 | struct clocksource *cs, *tmp; |
419 | unsigned long flags; | 428 | unsigned long flags; |
420 | LIST_HEAD(unstable); | 429 | LIST_HEAD(unstable); |
430 | int select = 0; | ||
421 | 431 | ||
422 | mutex_lock(&clocksource_mutex); | ||
423 | spin_lock_irqsave(&watchdog_lock, flags); | 432 | spin_lock_irqsave(&watchdog_lock, flags); |
424 | list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) | 433 | list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { |
425 | if (cs->flags & CLOCK_SOURCE_UNSTABLE) { | 434 | if (cs->flags & CLOCK_SOURCE_UNSTABLE) { |
426 | list_del_init(&cs->wd_list); | 435 | list_del_init(&cs->wd_list); |
427 | list_add(&cs->wd_list, &unstable); | 436 | list_add(&cs->wd_list, &unstable); |
437 | select = 1; | ||
428 | } | 438 | } |
439 | if (cs->flags & CLOCK_SOURCE_RESELECT) { | ||
440 | cs->flags &= ~CLOCK_SOURCE_RESELECT; | ||
441 | select = 1; | ||
442 | } | ||
443 | } | ||
429 | /* Check if the watchdog timer needs to be stopped. */ | 444 | /* Check if the watchdog timer needs to be stopped. */ |
430 | clocksource_stop_watchdog(); | 445 | clocksource_stop_watchdog(); |
431 | spin_unlock_irqrestore(&watchdog_lock, flags); | 446 | spin_unlock_irqrestore(&watchdog_lock, flags); |
@@ -435,10 +450,23 @@ static int clocksource_watchdog_kthread(void *data) | |||
435 | list_del_init(&cs->wd_list); | 450 | list_del_init(&cs->wd_list); |
436 | __clocksource_change_rating(cs, 0); | 451 | __clocksource_change_rating(cs, 0); |
437 | } | 452 | } |
453 | return select; | ||
454 | } | ||
455 | |||
456 | static int clocksource_watchdog_kthread(void *data) | ||
457 | { | ||
458 | mutex_lock(&clocksource_mutex); | ||
459 | if (__clocksource_watchdog_kthread()) | ||
460 | clocksource_select(); | ||
438 | mutex_unlock(&clocksource_mutex); | 461 | mutex_unlock(&clocksource_mutex); |
439 | return 0; | 462 | return 0; |
440 | } | 463 | } |
441 | 464 | ||
465 | static bool clocksource_is_watchdog(struct clocksource *cs) | ||
466 | { | ||
467 | return cs == watchdog; | ||
468 | } | ||
469 | |||
442 | #else /* CONFIG_CLOCKSOURCE_WATCHDOG */ | 470 | #else /* CONFIG_CLOCKSOURCE_WATCHDOG */ |
443 | 471 | ||
444 | static void clocksource_enqueue_watchdog(struct clocksource *cs) | 472 | static void clocksource_enqueue_watchdog(struct clocksource *cs) |
@@ -449,7 +477,8 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) | |||
449 | 477 | ||
450 | static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } | 478 | static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } |
451 | static inline void clocksource_resume_watchdog(void) { } | 479 | static inline void clocksource_resume_watchdog(void) { } |
452 | static inline int clocksource_watchdog_kthread(void *data) { return 0; } | 480 | static inline int __clocksource_watchdog_kthread(void) { return 0; } |
481 | static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } | ||
453 | 482 | ||
454 | #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ | 483 | #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ |
455 | 484 | ||
@@ -553,24 +582,42 @@ static u64 clocksource_max_deferment(struct clocksource *cs) | |||
553 | 582 | ||
554 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET | 583 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET |
555 | 584 | ||
556 | /** | 585 | static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) |
557 | * clocksource_select - Select the best clocksource available | ||
558 | * | ||
559 | * Private function. Must hold clocksource_mutex when called. | ||
560 | * | ||
561 | * Select the clocksource with the best rating, or the clocksource, | ||
562 | * which is selected by userspace override. | ||
563 | */ | ||
564 | static void clocksource_select(void) | ||
565 | { | 586 | { |
566 | struct clocksource *best, *cs; | 587 | struct clocksource *cs; |
567 | 588 | ||
568 | if (!finished_booting || list_empty(&clocksource_list)) | 589 | if (!finished_booting || list_empty(&clocksource_list)) |
590 | return NULL; | ||
591 | |||
592 | /* | ||
593 | * We pick the clocksource with the highest rating. If oneshot | ||
594 | * mode is active, we pick the highres valid clocksource with | ||
595 | * the best rating. | ||
596 | */ | ||
597 | list_for_each_entry(cs, &clocksource_list, list) { | ||
598 | if (skipcur && cs == curr_clocksource) | ||
599 | continue; | ||
600 | if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) | ||
601 | continue; | ||
602 | return cs; | ||
603 | } | ||
604 | return NULL; | ||
605 | } | ||
606 | |||
607 | static void __clocksource_select(bool skipcur) | ||
608 | { | ||
609 | bool oneshot = tick_oneshot_mode_active(); | ||
610 | struct clocksource *best, *cs; | ||
611 | |||
612 | /* Find the best suitable clocksource */ | ||
613 | best = clocksource_find_best(oneshot, skipcur); | ||
614 | if (!best) | ||
569 | return; | 615 | return; |
570 | /* First clocksource on the list has the best rating. */ | 616 | |
571 | best = list_first_entry(&clocksource_list, struct clocksource, list); | ||
572 | /* Check for the override clocksource. */ | 617 | /* Check for the override clocksource. */ |
573 | list_for_each_entry(cs, &clocksource_list, list) { | 618 | list_for_each_entry(cs, &clocksource_list, list) { |
619 | if (skipcur && cs == curr_clocksource) | ||
620 | continue; | ||
574 | if (strcmp(cs->name, override_name) != 0) | 621 | if (strcmp(cs->name, override_name) != 0) |
575 | continue; | 622 | continue; |
576 | /* | 623 | /* |
@@ -578,8 +625,7 @@ static void clocksource_select(void) | |||
578 | * capable clocksource if the tick code is in oneshot | 625 | * capable clocksource if the tick code is in oneshot |
579 | * mode (highres or nohz) | 626 | * mode (highres or nohz) |
580 | */ | 627 | */ |
581 | if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && | 628 | if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { |
582 | tick_oneshot_mode_active()) { | ||
583 | /* Override clocksource cannot be used. */ | 629 | /* Override clocksource cannot be used. */ |
584 | printk(KERN_WARNING "Override clocksource %s is not " | 630 | printk(KERN_WARNING "Override clocksource %s is not " |
585 | "HRT compatible. Cannot switch while in " | 631 | "HRT compatible. Cannot switch while in " |
@@ -590,16 +636,35 @@ static void clocksource_select(void) | |||
590 | best = cs; | 636 | best = cs; |
591 | break; | 637 | break; |
592 | } | 638 | } |
593 | if (curr_clocksource != best) { | 639 | |
594 | printk(KERN_INFO "Switching to clocksource %s\n", best->name); | 640 | if (curr_clocksource != best && !timekeeping_notify(best)) { |
641 | pr_info("Switched to clocksource %s\n", best->name); | ||
595 | curr_clocksource = best; | 642 | curr_clocksource = best; |
596 | timekeeping_notify(curr_clocksource); | ||
597 | } | 643 | } |
598 | } | 644 | } |
599 | 645 | ||
646 | /** | ||
647 | * clocksource_select - Select the best clocksource available | ||
648 | * | ||
649 | * Private function. Must hold clocksource_mutex when called. | ||
650 | * | ||
651 | * Select the clocksource with the best rating, or the clocksource | ||
652 | * selected by the userspace override. | ||
653 | */ | ||
654 | static void clocksource_select(void) | ||
655 | { | ||
656 | return __clocksource_select(false); | ||
657 | } | ||
658 | |||
659 | static void clocksource_select_fallback(void) | ||
660 | { | ||
661 | return __clocksource_select(true); | ||
662 | } | ||
663 | |||
600 | #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ | 664 | #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ |
601 | 665 | ||
602 | static inline void clocksource_select(void) { } | 666 | static inline void clocksource_select(void) { } |
667 | static inline void clocksource_select_fallback(void) { } | ||
603 | 668 | ||
604 | #endif | 669 | #endif |
605 | 670 | ||
@@ -614,16 +679,11 @@ static int __init clocksource_done_booting(void) | |||
614 | { | 679 | { |
615 | mutex_lock(&clocksource_mutex); | 680 | mutex_lock(&clocksource_mutex); |
616 | curr_clocksource = clocksource_default_clock(); | 681 | curr_clocksource = clocksource_default_clock(); |
617 | mutex_unlock(&clocksource_mutex); | ||
618 | |||
619 | finished_booting = 1; | 682 | finished_booting = 1; |
620 | |||
621 | /* | 683 | /* |
622 | * Run the watchdog first to eliminate unstable clock sources | 684 | * Run the watchdog first to eliminate unstable clock sources |
623 | */ | 685 | */ |
624 | clocksource_watchdog_kthread(NULL); | 686 | __clocksource_watchdog_kthread(); |
625 | |||
626 | mutex_lock(&clocksource_mutex); | ||
627 | clocksource_select(); | 687 | clocksource_select(); |
628 | mutex_unlock(&clocksource_mutex); | 688 | mutex_unlock(&clocksource_mutex); |
629 | return 0; | 689 | return 0; |
@@ -756,7 +816,6 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating) | |||
756 | list_del(&cs->list); | 816 | list_del(&cs->list); |
757 | cs->rating = rating; | 817 | cs->rating = rating; |
758 | clocksource_enqueue(cs); | 818 | clocksource_enqueue(cs); |
759 | clocksource_select(); | ||
760 | } | 819 | } |
761 | 820 | ||
762 | /** | 821 | /** |
@@ -768,21 +827,47 @@ void clocksource_change_rating(struct clocksource *cs, int rating) | |||
768 | { | 827 | { |
769 | mutex_lock(&clocksource_mutex); | 828 | mutex_lock(&clocksource_mutex); |
770 | __clocksource_change_rating(cs, rating); | 829 | __clocksource_change_rating(cs, rating); |
830 | clocksource_select(); | ||
771 | mutex_unlock(&clocksource_mutex); | 831 | mutex_unlock(&clocksource_mutex); |
772 | } | 832 | } |
773 | EXPORT_SYMBOL(clocksource_change_rating); | 833 | EXPORT_SYMBOL(clocksource_change_rating); |
774 | 834 | ||
835 | /* | ||
836 | * Unbind clocksource @cs. Called with clocksource_mutex held | ||
837 | */ | ||
838 | static int clocksource_unbind(struct clocksource *cs) | ||
839 | { | ||
840 | /* | ||
841 | * I really can't convince myself to support this on hardware | ||
842 | * designed by lobotomized monkeys. | ||
843 | */ | ||
844 | if (clocksource_is_watchdog(cs)) | ||
845 | return -EBUSY; | ||
846 | |||
847 | if (cs == curr_clocksource) { | ||
848 | /* Select and try to install a replacement clock source */ | ||
849 | clocksource_select_fallback(); | ||
850 | if (curr_clocksource == cs) | ||
851 | return -EBUSY; | ||
852 | } | ||
853 | clocksource_dequeue_watchdog(cs); | ||
854 | list_del_init(&cs->list); | ||
855 | return 0; | ||
856 | } | ||
857 | |||
775 | /** | 858 | /** |
776 | * clocksource_unregister - remove a registered clocksource | 859 | * clocksource_unregister - remove a registered clocksource |
777 | * @cs: clocksource to be unregistered | 860 | * @cs: clocksource to be unregistered |
778 | */ | 861 | */ |
779 | void clocksource_unregister(struct clocksource *cs) | 862 | int clocksource_unregister(struct clocksource *cs) |
780 | { | 863 | { |
864 | int ret = 0; | ||
865 | |||
781 | mutex_lock(&clocksource_mutex); | 866 | mutex_lock(&clocksource_mutex); |
782 | clocksource_dequeue_watchdog(cs); | 867 | if (!list_empty(&cs->list)) |
783 | list_del(&cs->list); | 868 | ret = clocksource_unbind(cs); |
784 | clocksource_select(); | ||
785 | mutex_unlock(&clocksource_mutex); | 869 | mutex_unlock(&clocksource_mutex); |
870 | return ret; | ||
786 | } | 871 | } |
787 | EXPORT_SYMBOL(clocksource_unregister); | 872 | EXPORT_SYMBOL(clocksource_unregister); |
788 | 873 | ||
@@ -808,6 +893,23 @@ sysfs_show_current_clocksources(struct device *dev, | |||
808 | return count; | 893 | return count; |
809 | } | 894 | } |
810 | 895 | ||
896 | ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) | ||
897 | { | ||
898 | ssize_t ret = cnt; | ||
899 | |||
900 | /* strings from sysfs write are not 0 terminated! */ | ||
901 | if (!cnt || cnt >= CS_NAME_LEN) | ||
902 | return -EINVAL; | ||
903 | |||
904 | /* strip off \n: */ | ||
905 | if (buf[cnt-1] == '\n') | ||
906 | cnt--; | ||
907 | if (cnt > 0) | ||
908 | memcpy(dst, buf, cnt); | ||
909 | dst[cnt] = 0; | ||
910 | return ret; | ||
911 | } | ||
912 | |||
811 | /** | 913 | /** |
812 | * sysfs_override_clocksource - interface for manually overriding clocksource | 914 | * sysfs_override_clocksource - interface for manually overriding clocksource |
813 | * @dev: unused | 915 | * @dev: unused |
@@ -822,22 +924,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev, | |||
822 | struct device_attribute *attr, | 924 | struct device_attribute *attr, |
823 | const char *buf, size_t count) | 925 | const char *buf, size_t count) |
824 | { | 926 | { |
825 | size_t ret = count; | 927 | ssize_t ret; |
826 | |||
827 | /* strings from sysfs write are not 0 terminated! */ | ||
828 | if (count >= sizeof(override_name)) | ||
829 | return -EINVAL; | ||
830 | |||
831 | /* strip of \n: */ | ||
832 | if (buf[count-1] == '\n') | ||
833 | count--; | ||
834 | 928 | ||
835 | mutex_lock(&clocksource_mutex); | 929 | mutex_lock(&clocksource_mutex); |
836 | 930 | ||
837 | if (count > 0) | 931 | ret = sysfs_get_uname(buf, override_name, count); |
838 | memcpy(override_name, buf, count); | 932 | if (ret >= 0) |
839 | override_name[count] = 0; | 933 | clocksource_select(); |
840 | clocksource_select(); | ||
841 | 934 | ||
842 | mutex_unlock(&clocksource_mutex); | 935 | mutex_unlock(&clocksource_mutex); |
843 | 936 | ||
@@ -845,6 +938,40 @@ static ssize_t sysfs_override_clocksource(struct device *dev, | |||
845 | } | 938 | } |
846 | 939 | ||
847 | /** | 940 | /** |
941 | * sysfs_unbind_clocksource - interface for manually unbinding clocksource | ||
942 | * @dev: unused | ||
943 | * @attr: unused | ||
944 | * @buf: name of the clocksource to unbind | ||
945 | * @count: length of buffer | ||
946 | * | ||
947 | * Takes input from sysfs interface for manually unbinding a clocksource. | ||
948 | */ | ||
949 | static ssize_t sysfs_unbind_clocksource(struct device *dev, | ||
950 | struct device_attribute *attr, | ||
951 | const char *buf, size_t count) | ||
952 | { | ||
953 | struct clocksource *cs; | ||
954 | char name[CS_NAME_LEN]; | ||
955 | ssize_t ret; | ||
956 | |||
957 | ret = sysfs_get_uname(buf, name, count); | ||
958 | if (ret < 0) | ||
959 | return ret; | ||
960 | |||
961 | ret = -ENODEV; | ||
962 | mutex_lock(&clocksource_mutex); | ||
963 | list_for_each_entry(cs, &clocksource_list, list) { | ||
964 | if (strcmp(cs->name, name)) | ||
965 | continue; | ||
966 | ret = clocksource_unbind(cs); | ||
967 | break; | ||
968 | } | ||
969 | mutex_unlock(&clocksource_mutex); | ||
970 | |||
971 | return ret ? ret : count; | ||
972 | } | ||
973 | |||
974 | /** | ||
848 | * sysfs_show_available_clocksources - sysfs interface for listing clocksource | 975 | * sysfs_show_available_clocksources - sysfs interface for listing clocksource |
849 | * @dev: unused | 976 | * @dev: unused |
850 | * @attr: unused | 977 | * @attr: unused |
@@ -886,6 +1013,8 @@ sysfs_show_available_clocksources(struct device *dev, | |||
886 | static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, | 1013 | static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, |
887 | sysfs_override_clocksource); | 1014 | sysfs_override_clocksource); |
888 | 1015 | ||
1016 | static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource); | ||
1017 | |||
889 | static DEVICE_ATTR(available_clocksource, 0444, | 1018 | static DEVICE_ATTR(available_clocksource, 0444, |
890 | sysfs_show_available_clocksources, NULL); | 1019 | sysfs_show_available_clocksources, NULL); |
891 | 1020 | ||
@@ -910,6 +1039,9 @@ static int __init init_clocksource_sysfs(void) | |||
910 | &device_clocksource, | 1039 | &device_clocksource, |
911 | &dev_attr_current_clocksource); | 1040 | &dev_attr_current_clocksource); |
912 | if (!error) | 1041 | if (!error) |
1042 | error = device_create_file(&device_clocksource, | ||
1043 | &dev_attr_unbind_clocksource); | ||
1044 | if (!error) | ||
913 | error = device_create_file( | 1045 | error = device_create_file( |
914 | &device_clocksource, | 1046 | &device_clocksource, |
915 | &dev_attr_available_clocksource); | 1047 | &dev_attr_available_clocksource); |
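The clocksource.c changes above split the selection logic in two: clocksource_find_best() scans the rating-sorted list for the first usable entry, and __clocksource_select() then lets a userspace override win unless it is not HRES capable, with skipcur=true providing the fallback path that clocksource_unbind() needs before it can drop the current clock. A minimal userspace sketch of that two-step selection over a plain array (the names, ratings and the simplified override handling are illustrative, not the kernel's data) looks like this:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* simplified stand-in for struct clocksource */
struct cs {
	const char *name;
	int rating;
	bool valid_for_hres;
};

/* kept sorted by descending rating, like clocksource_enqueue() does */
static struct cs list[] = {
	{ "tsc",     300, true  },
	{ "hpet",    250, true  },
	{ "acpi_pm", 200, false },
	{ "jiffies",   1, false },
};
static struct cs *curr = &list[3];	/* boot clock: jiffies */
static char override_name[32];

/* mirror of clocksource_find_best(): first acceptable entry wins */
static struct cs *find_best(bool oneshot, bool skipcur)
{
	for (size_t i = 0; i < sizeof(list) / sizeof(list[0]); i++) {
		struct cs *cs = &list[i];
		if (skipcur && cs == curr)
			continue;
		if (oneshot && !cs->valid_for_hres)
			continue;
		return cs;
	}
	return NULL;
}

/* mirror of __clocksource_select(): honour the override if it is usable */
static void select_cs(bool oneshot, bool skipcur)
{
	struct cs *best = find_best(oneshot, skipcur);

	if (!best)
		return;
	for (size_t i = 0; i < sizeof(list) / sizeof(list[0]); i++) {
		struct cs *cs = &list[i];
		if (skipcur && cs == curr)
			continue;
		if (strcmp(cs->name, override_name) != 0)
			continue;
		if (oneshot && !cs->valid_for_hres)
			break;		/* override not HRT capable, keep best */
		best = cs;
		break;
	}
	if (curr != best) {
		printf("Switched to clocksource %s\n", best->name);
		curr = best;
	}
}

int main(void)
{
	select_cs(true, false);			/* picks tsc, the best rated */
	strcpy(override_name, "hpet");
	select_cs(true, false);			/* override wins: hpet */
	select_cs(true, true);			/* unbind/fallback path: skip hpet, back to tsc */
	return 0;
}

Running it switches from the boot clock to the best rated entry, honours the override, and on the fallback path skips the current clock exactly as clocksource_select_fallback() does for clocksource_unbind().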
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c new file mode 100644 index 000000000000..a326f27d7f09 --- /dev/null +++ b/kernel/time/sched_clock.c | |||
@@ -0,0 +1,212 @@ | |||
1 | /* | ||
2 | * sched_clock.c: support for extending counters to full 64-bit ns counter | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | */ | ||
8 | #include <linux/clocksource.h> | ||
9 | #include <linux/init.h> | ||
10 | #include <linux/jiffies.h> | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/moduleparam.h> | ||
13 | #include <linux/sched.h> | ||
14 | #include <linux/syscore_ops.h> | ||
15 | #include <linux/timer.h> | ||
16 | #include <linux/sched_clock.h> | ||
17 | |||
18 | struct clock_data { | ||
19 | u64 epoch_ns; | ||
20 | u32 epoch_cyc; | ||
21 | u32 epoch_cyc_copy; | ||
22 | unsigned long rate; | ||
23 | u32 mult; | ||
24 | u32 shift; | ||
25 | bool suspended; | ||
26 | }; | ||
27 | |||
28 | static void sched_clock_poll(unsigned long wrap_ticks); | ||
29 | static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0); | ||
30 | static int irqtime = -1; | ||
31 | |||
32 | core_param(irqtime, irqtime, int, 0400); | ||
33 | |||
34 | static struct clock_data cd = { | ||
35 | .mult = NSEC_PER_SEC / HZ, | ||
36 | }; | ||
37 | |||
38 | static u32 __read_mostly sched_clock_mask = 0xffffffff; | ||
39 | |||
40 | static u32 notrace jiffy_sched_clock_read(void) | ||
41 | { | ||
42 | return (u32)(jiffies - INITIAL_JIFFIES); | ||
43 | } | ||
44 | |||
45 | static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; | ||
46 | |||
47 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | ||
48 | { | ||
49 | return (cyc * mult) >> shift; | ||
50 | } | ||
51 | |||
52 | static unsigned long long notrace sched_clock_32(void) | ||
53 | { | ||
54 | u64 epoch_ns; | ||
55 | u32 epoch_cyc; | ||
56 | u32 cyc; | ||
57 | |||
58 | if (cd.suspended) | ||
59 | return cd.epoch_ns; | ||
60 | |||
61 | /* | ||
62 | * Load the epoch_cyc and epoch_ns atomically. We do this by | ||
63 | * ensuring that we always write epoch_cyc, epoch_ns and | ||
64 | * epoch_cyc_copy in strict order, and read them in strict order. | ||
65 | * If epoch_cyc and epoch_cyc_copy are not equal, then we're in | ||
66 | * the middle of an update, and we should repeat the load. | ||
67 | */ | ||
68 | do { | ||
69 | epoch_cyc = cd.epoch_cyc; | ||
70 | smp_rmb(); | ||
71 | epoch_ns = cd.epoch_ns; | ||
72 | smp_rmb(); | ||
73 | } while (epoch_cyc != cd.epoch_cyc_copy); | ||
74 | |||
75 | cyc = read_sched_clock(); | ||
76 | cyc = (cyc - epoch_cyc) & sched_clock_mask; | ||
77 | return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * Atomically update the sched_clock epoch. | ||
82 | */ | ||
83 | static void notrace update_sched_clock(void) | ||
84 | { | ||
85 | unsigned long flags; | ||
86 | u32 cyc; | ||
87 | u64 ns; | ||
88 | |||
89 | cyc = read_sched_clock(); | ||
90 | ns = cd.epoch_ns + | ||
91 | cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, | ||
92 | cd.mult, cd.shift); | ||
93 | /* | ||
94 | * Write epoch_cyc and epoch_ns in a way that the update is | ||
95 | * detectable in sched_clock_32(). | ||
96 | */ | ||
97 | raw_local_irq_save(flags); | ||
98 | cd.epoch_cyc_copy = cyc; | ||
99 | smp_wmb(); | ||
100 | cd.epoch_ns = ns; | ||
101 | smp_wmb(); | ||
102 | cd.epoch_cyc = cyc; | ||
103 | raw_local_irq_restore(flags); | ||
104 | } | ||
105 | |||
106 | static void sched_clock_poll(unsigned long wrap_ticks) | ||
107 | { | ||
108 | mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks)); | ||
109 | update_sched_clock(); | ||
110 | } | ||
111 | |||
112 | void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) | ||
113 | { | ||
114 | unsigned long r, w; | ||
115 | u64 res, wrap; | ||
116 | char r_unit; | ||
117 | |||
118 | if (cd.rate > rate) | ||
119 | return; | ||
120 | |||
121 | BUG_ON(bits > 32); | ||
122 | WARN_ON(!irqs_disabled()); | ||
123 | read_sched_clock = read; | ||
124 | sched_clock_mask = (1ULL << bits) - 1; | ||
125 | cd.rate = rate; | ||
126 | |||
127 | /* calculate the mult/shift to convert counter ticks to ns. */ | ||
128 | clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); | ||
129 | |||
130 | r = rate; | ||
131 | if (r >= 4000000) { | ||
132 | r /= 1000000; | ||
133 | r_unit = 'M'; | ||
134 | } else if (r >= 1000) { | ||
135 | r /= 1000; | ||
136 | r_unit = 'k'; | ||
137 | } else | ||
138 | r_unit = ' '; | ||
139 | |||
140 | /* calculate how many ns until we wrap */ | ||
141 | wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); | ||
142 | do_div(wrap, NSEC_PER_MSEC); | ||
143 | w = wrap; | ||
144 | |||
145 | /* calculate the ns resolution of this counter */ | ||
146 | res = cyc_to_ns(1ULL, cd.mult, cd.shift); | ||
147 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", | ||
148 | bits, r, r_unit, res, w); | ||
149 | |||
150 | /* | ||
151 | * Start the timer to keep sched_clock() properly updated and | ||
152 | * sets the initial epoch. | ||
153 | */ | ||
154 | sched_clock_timer.data = msecs_to_jiffies(w - (w / 10)); | ||
155 | update_sched_clock(); | ||
156 | |||
157 | /* | ||
158 | * Ensure that sched_clock() starts off at 0ns | ||
159 | */ | ||
160 | cd.epoch_ns = 0; | ||
161 | |||
162 | /* Enable IRQ time accounting if we have a fast enough sched_clock */ | ||
163 | if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) | ||
164 | enable_sched_clock_irqtime(); | ||
165 | |||
166 | pr_debug("Registered %pF as sched_clock source\n", read); | ||
167 | } | ||
168 | |||
169 | unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; | ||
170 | |||
171 | unsigned long long notrace sched_clock(void) | ||
172 | { | ||
173 | return sched_clock_func(); | ||
174 | } | ||
175 | |||
176 | void __init sched_clock_postinit(void) | ||
177 | { | ||
178 | /* | ||
179 | * If no sched_clock function has been provided at that point, | ||
180 | * make it the final one. | ||
181 | */ | ||
182 | if (read_sched_clock == jiffy_sched_clock_read) | ||
183 | setup_sched_clock(jiffy_sched_clock_read, 32, HZ); | ||
184 | |||
185 | sched_clock_poll(sched_clock_timer.data); | ||
186 | } | ||
187 | |||
188 | static int sched_clock_suspend(void) | ||
189 | { | ||
190 | sched_clock_poll(sched_clock_timer.data); | ||
191 | cd.suspended = true; | ||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | static void sched_clock_resume(void) | ||
196 | { | ||
197 | cd.epoch_cyc = read_sched_clock(); | ||
198 | cd.epoch_cyc_copy = cd.epoch_cyc; | ||
199 | cd.suspended = false; | ||
200 | } | ||
201 | |||
202 | static struct syscore_ops sched_clock_ops = { | ||
203 | .suspend = sched_clock_suspend, | ||
204 | .resume = sched_clock_resume, | ||
205 | }; | ||
206 | |||
207 | static int __init sched_clock_syscore_init(void) | ||
208 | { | ||
209 | register_syscore_ops(&sched_clock_ops); | ||
210 | return 0; | ||
211 | } | ||
212 | device_initcall(sched_clock_syscore_init); | ||
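setup_sched_clock() above converts counter ticks to nanoseconds without a division in the fast path: clocks_calc_mult_shift() picks a (mult, shift) pair so that cyc_to_ns(cyc) = (cyc * mult) >> shift approximates cyc * NSEC_PER_SEC / rate, and the wrap interval printed at registration time is just cyc_to_ns() of the maximum counter value. A rough userspace illustration with a hand-picked shift and a hypothetical 24 MHz, 32-bit counter (the kernel helper chooses mult/shift more carefully) is:

#include <inttypes.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

/* same helper as in sched_clock.c: fixed-point cycles -> nanoseconds */
static inline uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
	return (cyc * mult) >> shift;
}

int main(void)
{
	/* assumed hardware: a 32-bit counter running at 24 MHz */
	unsigned long rate = 24000000;
	int bits = 32;

	/*
	 * Hand-picked shift; the kernel uses clocks_calc_mult_shift() to
	 * maximise precision while keeping the multiply from overflowing.
	 */
	uint32_t shift = 24;
	uint32_t mult = (uint32_t)(((uint64_t)NSEC_PER_SEC << shift) / rate);

	/* resolution of one counter tick in ns (~41ns at 24 MHz) */
	uint64_t res = cyc_to_ns(1, mult, shift);

	/* how long until the counter wraps, in milliseconds */
	uint64_t wrap_ms = cyc_to_ns((1ULL << bits) - 1, mult, shift) / 1000000;

	printf("sched_clock: %d bits at %luHz, resolution %" PRIu64
	       "ns, wraps every %" PRIu64 "ms\n", bits, rate, res, wrap_ms);
	return 0;
}

At 24 MHz this reports a 41ns resolution and a wrap interval of roughly 179 seconds, which is why the poll timer above is armed at 90% of the computed wrap time.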
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 20d6fba70652..6d3f91631de6 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
21 | #include <linux/smp.h> | 21 | #include <linux/smp.h> |
22 | #include <linux/module.h> | ||
22 | 23 | ||
23 | #include "tick-internal.h" | 24 | #include "tick-internal.h" |
24 | 25 | ||
@@ -29,6 +30,7 @@ | |||
29 | 30 | ||
30 | static struct tick_device tick_broadcast_device; | 31 | static struct tick_device tick_broadcast_device; |
31 | static cpumask_var_t tick_broadcast_mask; | 32 | static cpumask_var_t tick_broadcast_mask; |
33 | static cpumask_var_t tick_broadcast_on; | ||
32 | static cpumask_var_t tmpmask; | 34 | static cpumask_var_t tmpmask; |
33 | static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); | 35 | static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); |
34 | static int tick_broadcast_force; | 36 | static int tick_broadcast_force; |
@@ -64,17 +66,34 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) | |||
64 | /* | 66 | /* |
65 | * Check, if the device can be utilized as broadcast device: | 67 | * Check, if the device can be utilized as broadcast device: |
66 | */ | 68 | */ |
67 | int tick_check_broadcast_device(struct clock_event_device *dev) | 69 | static bool tick_check_broadcast_device(struct clock_event_device *curdev, |
70 | struct clock_event_device *newdev) | ||
71 | { | ||
72 | if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || | ||
73 | (newdev->features & CLOCK_EVT_FEAT_C3STOP)) | ||
74 | return false; | ||
75 | |||
76 | if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT && | ||
77 | !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) | ||
78 | return false; | ||
79 | |||
80 | return !curdev || newdev->rating > curdev->rating; | ||
81 | } | ||
82 | |||
83 | /* | ||
84 | * Conditionally install/replace broadcast device | ||
85 | */ | ||
86 | void tick_install_broadcast_device(struct clock_event_device *dev) | ||
68 | { | 87 | { |
69 | struct clock_event_device *cur = tick_broadcast_device.evtdev; | 88 | struct clock_event_device *cur = tick_broadcast_device.evtdev; |
70 | 89 | ||
71 | if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || | 90 | if (!tick_check_broadcast_device(cur, dev)) |
72 | (tick_broadcast_device.evtdev && | 91 | return; |
73 | tick_broadcast_device.evtdev->rating >= dev->rating) || | ||
74 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) | ||
75 | return 0; | ||
76 | 92 | ||
77 | clockevents_exchange_device(tick_broadcast_device.evtdev, dev); | 93 | if (!try_module_get(dev->owner)) |
94 | return; | ||
95 | |||
96 | clockevents_exchange_device(cur, dev); | ||
78 | if (cur) | 97 | if (cur) |
79 | cur->event_handler = clockevents_handle_noop; | 98 | cur->event_handler = clockevents_handle_noop; |
80 | tick_broadcast_device.evtdev = dev; | 99 | tick_broadcast_device.evtdev = dev; |
@@ -90,7 +109,6 @@ int tick_check_broadcast_device(struct clock_event_device *dev) | |||
90 | */ | 109 | */ |
91 | if (dev->features & CLOCK_EVT_FEAT_ONESHOT) | 110 | if (dev->features & CLOCK_EVT_FEAT_ONESHOT) |
92 | tick_clock_notify(); | 111 | tick_clock_notify(); |
93 | return 1; | ||
94 | } | 112 | } |
95 | 113 | ||
96 | /* | 114 | /* |
@@ -123,8 +141,9 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev) | |||
123 | */ | 141 | */ |
124 | int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) | 142 | int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) |
125 | { | 143 | { |
144 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | ||
126 | unsigned long flags; | 145 | unsigned long flags; |
127 | int ret = 0; | 146 | int ret; |
128 | 147 | ||
129 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 148 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
130 | 149 | ||
@@ -138,20 +157,59 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) | |||
138 | dev->event_handler = tick_handle_periodic; | 157 | dev->event_handler = tick_handle_periodic; |
139 | tick_device_setup_broadcast_func(dev); | 158 | tick_device_setup_broadcast_func(dev); |
140 | cpumask_set_cpu(cpu, tick_broadcast_mask); | 159 | cpumask_set_cpu(cpu, tick_broadcast_mask); |
141 | tick_broadcast_start_periodic(tick_broadcast_device.evtdev); | 160 | tick_broadcast_start_periodic(bc); |
142 | ret = 1; | 161 | ret = 1; |
143 | } else { | 162 | } else { |
144 | /* | 163 | /* |
145 | * When the new device is not affected by the stop | 164 | * Clear the broadcast bit for this cpu if the |
146 | * feature and the cpu is marked in the broadcast mask | 165 | * device is not power state affected. |
147 | * then clear the broadcast bit. | ||
148 | */ | 166 | */ |
149 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { | 167 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) |
150 | int cpu = smp_processor_id(); | ||
151 | cpumask_clear_cpu(cpu, tick_broadcast_mask); | 168 | cpumask_clear_cpu(cpu, tick_broadcast_mask); |
152 | tick_broadcast_clear_oneshot(cpu); | 169 | else |
153 | } else { | ||
154 | tick_device_setup_broadcast_func(dev); | 170 | tick_device_setup_broadcast_func(dev); |
171 | |||
172 | /* | ||
173 | * Clear the broadcast bit if the CPU is not in | ||
174 | * periodic broadcast on state. | ||
175 | */ | ||
176 | if (!cpumask_test_cpu(cpu, tick_broadcast_on)) | ||
177 | cpumask_clear_cpu(cpu, tick_broadcast_mask); | ||
178 | |||
179 | switch (tick_broadcast_device.mode) { | ||
180 | case TICKDEV_MODE_ONESHOT: | ||
181 | /* | ||
182 | * If the system is in oneshot mode we can | ||
183 | * unconditionally clear the oneshot mask bit, | ||
184 | * because the CPU is running and therefore | ||
185 | * not in an idle state which causes the power | ||
186 | * state affected device to stop. Let the | ||
187 | * caller initialize the device. | ||
188 | */ | ||
189 | tick_broadcast_clear_oneshot(cpu); | ||
190 | ret = 0; | ||
191 | break; | ||
192 | |||
193 | case TICKDEV_MODE_PERIODIC: | ||
194 | /* | ||
195 | * If the system is in periodic mode, check | ||
196 | * whether the broadcast device can be | ||
197 | * switched off now. | ||
198 | */ | ||
199 | if (cpumask_empty(tick_broadcast_mask) && bc) | ||
200 | clockevents_shutdown(bc); | ||
201 | /* | ||
202 | * If we kept the cpu in the broadcast mask, | ||
203 | * tell the caller to leave the per cpu device | ||
204 | * in shutdown state. The periodic interrupt | ||
205 | * is delivered by the broadcast device. | ||
206 | */ | ||
207 | ret = cpumask_test_cpu(cpu, tick_broadcast_mask); | ||
208 | break; | ||
209 | default: | ||
210 | /* Nothing to do */ | ||
211 | ret = 0; | ||
212 | break; | ||
155 | } | 213 | } |
156 | } | 214 | } |
157 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 215 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
@@ -281,6 +339,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) | |||
281 | switch (*reason) { | 339 | switch (*reason) { |
282 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | 340 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: |
283 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | 341 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: |
342 | cpumask_set_cpu(cpu, tick_broadcast_on); | ||
284 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { | 343 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { |
285 | if (tick_broadcast_device.mode == | 344 | if (tick_broadcast_device.mode == |
286 | TICKDEV_MODE_PERIODIC) | 345 | TICKDEV_MODE_PERIODIC) |
@@ -290,8 +349,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason) | |||
290 | tick_broadcast_force = 1; | 349 | tick_broadcast_force = 1; |
291 | break; | 350 | break; |
292 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | 351 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: |
293 | if (!tick_broadcast_force && | 352 | if (tick_broadcast_force) |
294 | cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { | 353 | break; |
354 | cpumask_clear_cpu(cpu, tick_broadcast_on); | ||
355 | if (!tick_device_is_functional(dev)) | ||
356 | break; | ||
357 | if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { | ||
295 | if (tick_broadcast_device.mode == | 358 | if (tick_broadcast_device.mode == |
296 | TICKDEV_MODE_PERIODIC) | 359 | TICKDEV_MODE_PERIODIC) |
297 | tick_setup_periodic(dev, 0); | 360 | tick_setup_periodic(dev, 0); |
@@ -349,6 +412,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) | |||
349 | 412 | ||
350 | bc = tick_broadcast_device.evtdev; | 413 | bc = tick_broadcast_device.evtdev; |
351 | cpumask_clear_cpu(cpu, tick_broadcast_mask); | 414 | cpumask_clear_cpu(cpu, tick_broadcast_mask); |
415 | cpumask_clear_cpu(cpu, tick_broadcast_on); | ||
352 | 416 | ||
353 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { | 417 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { |
354 | if (bc && cpumask_empty(tick_broadcast_mask)) | 418 | if (bc && cpumask_empty(tick_broadcast_mask)) |
@@ -475,7 +539,15 @@ void tick_check_oneshot_broadcast(int cpu) | |||
475 | if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { | 539 | if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { |
476 | struct tick_device *td = &per_cpu(tick_cpu_device, cpu); | 540 | struct tick_device *td = &per_cpu(tick_cpu_device, cpu); |
477 | 541 | ||
478 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); | 542 | /* |
543 | * We might be in the middle of switching over from | ||
544 | * periodic to oneshot. If the CPU has not yet | ||
545 | * switched over, leave the device alone. | ||
546 | */ | ||
547 | if (td->mode == TICKDEV_MODE_ONESHOT) { | ||
548 | clockevents_set_mode(td->evtdev, | ||
549 | CLOCK_EVT_MODE_ONESHOT); | ||
550 | } | ||
479 | } | 551 | } |
480 | } | 552 | } |
481 | 553 | ||
@@ -522,6 +594,13 @@ again: | |||
522 | cpumask_clear(tick_broadcast_force_mask); | 594 | cpumask_clear(tick_broadcast_force_mask); |
523 | 595 | ||
524 | /* | 596 | /* |
597 | * Sanity check. Catch the case where we try to broadcast to | ||
598 | * offline cpus. | ||
599 | */ | ||
600 | if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask))) | ||
601 | cpumask_and(tmpmask, tmpmask, cpu_online_mask); | ||
602 | |||
603 | /* | ||
525 | * Wakeup the cpus which have an expired event. | 604 | * Wakeup the cpus which have an expired event. |
526 | */ | 605 | */ |
527 | tick_do_broadcast(tmpmask); | 606 | tick_do_broadcast(tmpmask); |
@@ -761,10 +840,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | |||
761 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 840 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
762 | 841 | ||
763 | /* | 842 | /* |
764 | * Clear the broadcast mask flag for the dead cpu, but do not | 843 | * Clear the broadcast masks for the dead cpu, but do not stop |
765 | * stop the broadcast device! | 844 | * the broadcast device! |
766 | */ | 845 | */ |
767 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); | 846 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); |
847 | cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); | ||
848 | cpumask_clear_cpu(cpu, tick_broadcast_force_mask); | ||
768 | 849 | ||
769 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 850 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
770 | } | 851 | } |
@@ -792,6 +873,7 @@ bool tick_broadcast_oneshot_available(void) | |||
792 | void __init tick_broadcast_init(void) | 873 | void __init tick_broadcast_init(void) |
793 | { | 874 | { |
794 | zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); | 875 | zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); |
876 | zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT); | ||
795 | zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); | 877 | zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); |
796 | #ifdef CONFIG_TICK_ONESHOT | 878 | #ifdef CONFIG_TICK_ONESHOT |
797 | zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); | 879 | zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); |
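tick_check_broadcast_device() above boils the old open-coded test down to three rules: dummy devices and devices that stop in deep C-states can never drive the broadcast, a oneshot-mode broadcast setup requires ONESHOT support, and an existing device is only replaced by a strictly higher rating. A compact userspace sketch of that predicate, with made-up feature bits, names and ratings, is:

#include <stdbool.h>
#include <stdio.h>

/* illustrative feature bits, mirroring CLOCK_EVT_FEAT_* */
#define FEAT_ONESHOT 0x1
#define FEAT_C3STOP  0x2
#define FEAT_DUMMY   0x4

struct evtdev {
	const char *name;
	int rating;
	unsigned int features;
};

static bool broadcast_oneshot_mode;

/* same shape as tick_check_broadcast_device(): curdev may be NULL */
static bool check_broadcast_device(const struct evtdev *curdev,
				   const struct evtdev *newdev)
{
	if (newdev->features & (FEAT_DUMMY | FEAT_C3STOP))
		return false;
	if (broadcast_oneshot_mode && !(newdev->features & FEAT_ONESHOT))
		return false;
	return !curdev || newdev->rating > curdev->rating;
}

int main(void)
{
	struct evtdev pit   = { "pit",   110, FEAT_ONESHOT };
	struct evtdev hpet  = { "hpet",  250, FEAT_ONESHOT };
	struct evtdev lapic = { "lapic", 400, FEAT_ONESHOT | FEAT_C3STOP };

	printf("%d\n", check_broadcast_device(NULL, &pit));	/* 1: install */
	printf("%d\n", check_broadcast_device(&pit, &hpet));	/* 1: better rating */
	printf("%d\n", check_broadcast_device(&hpet, &lapic));	/* 0: stops in C3 */
	return 0;
}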
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 5d3fb100bc06..64522ecdfe0e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/percpu.h> | 18 | #include <linux/percpu.h> |
19 | #include <linux/profile.h> | 19 | #include <linux/profile.h> |
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
21 | #include <linux/module.h> | ||
21 | 22 | ||
22 | #include <asm/irq_regs.h> | 23 | #include <asm/irq_regs.h> |
23 | 24 | ||
@@ -33,7 +34,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); | |||
33 | ktime_t tick_next_period; | 34 | ktime_t tick_next_period; |
34 | ktime_t tick_period; | 35 | ktime_t tick_period; |
35 | int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; | 36 | int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; |
36 | static DEFINE_RAW_SPINLOCK(tick_device_lock); | ||
37 | 37 | ||
38 | /* | 38 | /* |
39 | * Debugging: see timer_list.c | 39 | * Debugging: see timer_list.c |
@@ -194,7 +194,8 @@ static void tick_setup_device(struct tick_device *td, | |||
194 | * When global broadcasting is active, check if the current | 194 | * When global broadcasting is active, check if the current |
195 | * device is registered as a placeholder for broadcast mode. | 195 | * device is registered as a placeholder for broadcast mode. |
196 | * This allows us to handle this x86 misfeature in a generic | 196 | * This allows us to handle this x86 misfeature in a generic |
197 | * way. | 197 | * way. This function also returns !=0 when we keep the |
198 | * current active broadcast state for this CPU. | ||
198 | */ | 199 | */ |
199 | if (tick_device_uses_broadcast(newdev, cpu)) | 200 | if (tick_device_uses_broadcast(newdev, cpu)) |
200 | return; | 201 | return; |
@@ -205,17 +206,75 @@ static void tick_setup_device(struct tick_device *td, | |||
205 | tick_setup_oneshot(newdev, handler, next_event); | 206 | tick_setup_oneshot(newdev, handler, next_event); |
206 | } | 207 | } |
207 | 208 | ||
209 | void tick_install_replacement(struct clock_event_device *newdev) | ||
210 | { | ||
211 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | ||
212 | int cpu = smp_processor_id(); | ||
213 | |||
214 | clockevents_exchange_device(td->evtdev, newdev); | ||
215 | tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); | ||
216 | if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) | ||
217 | tick_oneshot_notify(); | ||
218 | } | ||
219 | |||
220 | static bool tick_check_percpu(struct clock_event_device *curdev, | ||
221 | struct clock_event_device *newdev, int cpu) | ||
222 | { | ||
223 | if (!cpumask_test_cpu(cpu, newdev->cpumask)) | ||
224 | return false; | ||
225 | if (cpumask_equal(newdev->cpumask, cpumask_of(cpu))) | ||
226 | return true; | ||
227 | /* Check if irq affinity can be set */ | ||
228 | if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq)) | ||
229 | return false; | ||
230 | /* Prefer an existing cpu local device */ | ||
231 | if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) | ||
232 | return false; | ||
233 | return true; | ||
234 | } | ||
235 | |||
236 | static bool tick_check_preferred(struct clock_event_device *curdev, | ||
237 | struct clock_event_device *newdev) | ||
238 | { | ||
239 | /* Prefer oneshot capable device */ | ||
240 | if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) { | ||
241 | if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT)) | ||
242 | return false; | ||
243 | if (tick_oneshot_mode_active()) | ||
244 | return false; | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * Use the higher rated one, but prefer a CPU local device with a lower | ||
249 | * rating than a non-CPU local device | ||
250 | */ | ||
251 | return !curdev || | ||
252 | newdev->rating > curdev->rating || | ||
253 | !cpumask_equal(curdev->cpumask, newdev->cpumask); | ||
254 | } | ||
255 | |||
256 | /* | ||
257 | * Check whether the new device is a better fit than curdev. curdev | ||
258 | * can be NULL ! | ||
259 | */ | ||
260 | bool tick_check_replacement(struct clock_event_device *curdev, | ||
261 | struct clock_event_device *newdev) | ||
262 | { | ||
263 | if (!tick_check_percpu(curdev, newdev, smp_processor_id())) | ||
264 | return false; | ||
265 | |||
266 | return tick_check_preferred(curdev, newdev); | ||
267 | } | ||
268 | |||
208 | /* | 269 | /* |
209 | * Check, if the new registered device should be used. | 270 | * Check, if the new registered device should be used. Called with |
271 | * clockevents_lock held and interrupts disabled. | ||
210 | */ | 272 | */ |
211 | static int tick_check_new_device(struct clock_event_device *newdev) | 273 | void tick_check_new_device(struct clock_event_device *newdev) |
212 | { | 274 | { |
213 | struct clock_event_device *curdev; | 275 | struct clock_event_device *curdev; |
214 | struct tick_device *td; | 276 | struct tick_device *td; |
215 | int cpu, ret = NOTIFY_OK; | 277 | int cpu; |
216 | unsigned long flags; | ||
217 | |||
218 | raw_spin_lock_irqsave(&tick_device_lock, flags); | ||
219 | 278 | ||
220 | cpu = smp_processor_id(); | 279 | cpu = smp_processor_id(); |
221 | if (!cpumask_test_cpu(cpu, newdev->cpumask)) | 280 | if (!cpumask_test_cpu(cpu, newdev->cpumask)) |
@@ -225,40 +284,15 @@ static int tick_check_new_device(struct clock_event_device *newdev) | |||
225 | curdev = td->evtdev; | 284 | curdev = td->evtdev; |
226 | 285 | ||
227 | /* cpu local device ? */ | 286 | /* cpu local device ? */ |
228 | if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { | 287 | if (!tick_check_percpu(curdev, newdev, cpu)) |
229 | 288 | goto out_bc; | |
230 | /* | ||
231 | * If the cpu affinity of the device interrupt can not | ||
232 | * be set, ignore it. | ||
233 | */ | ||
234 | if (!irq_can_set_affinity(newdev->irq)) | ||
235 | goto out_bc; | ||
236 | 289 | ||
237 | /* | 290 | /* Preference decision */ |
238 | * If we have a cpu local device already, do not replace it | 291 | if (!tick_check_preferred(curdev, newdev)) |
239 | * by a non cpu local device | 292 | goto out_bc; |
240 | */ | ||
241 | if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) | ||
242 | goto out_bc; | ||
243 | } | ||
244 | 293 | ||
245 | /* | 294 | if (!try_module_get(newdev->owner)) |
246 | * If we have an active device, then check the rating and the oneshot | 295 | return; |
247 | * feature. | ||
248 | */ | ||
249 | if (curdev) { | ||
250 | /* | ||
251 | * Prefer one shot capable devices ! | ||
252 | */ | ||
253 | if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && | ||
254 | !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) | ||
255 | goto out_bc; | ||
256 | /* | ||
257 | * Check the rating | ||
258 | */ | ||
259 | if (curdev->rating >= newdev->rating) | ||
260 | goto out_bc; | ||
261 | } | ||
262 | 296 | ||
263 | /* | 297 | /* |
264 | * Replace the eventually existing device by the new | 298 | * Replace the eventually existing device by the new |
@@ -273,20 +307,13 @@ static int tick_check_new_device(struct clock_event_device *newdev) | |||
273 | tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); | 307 | tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); |
274 | if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) | 308 | if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) |
275 | tick_oneshot_notify(); | 309 | tick_oneshot_notify(); |
276 | 310 | return; | |
277 | raw_spin_unlock_irqrestore(&tick_device_lock, flags); | ||
278 | return NOTIFY_STOP; | ||
279 | 311 | ||
280 | out_bc: | 312 | out_bc: |
281 | /* | 313 | /* |
282 | * Can the new device be used as a broadcast device ? | 314 | * Can the new device be used as a broadcast device ? |
283 | */ | 315 | */ |
284 | if (tick_check_broadcast_device(newdev)) | 316 | tick_install_broadcast_device(newdev); |
285 | ret = NOTIFY_STOP; | ||
286 | |||
287 | raw_spin_unlock_irqrestore(&tick_device_lock, flags); | ||
288 | |||
289 | return ret; | ||
290 | } | 317 | } |
291 | 318 | ||
292 | /* | 319 | /* |
@@ -294,7 +321,7 @@ out_bc: | |||
294 | * | 321 | * |
295 | * Called with interrupts disabled. | 322 | * Called with interrupts disabled. |
296 | */ | 323 | */ |
297 | static void tick_handover_do_timer(int *cpup) | 324 | void tick_handover_do_timer(int *cpup) |
298 | { | 325 | { |
299 | if (*cpup == tick_do_timer_cpu) { | 326 | if (*cpup == tick_do_timer_cpu) { |
300 | int cpu = cpumask_first(cpu_online_mask); | 327 | int cpu = cpumask_first(cpu_online_mask); |
@@ -311,13 +338,11 @@ static void tick_handover_do_timer(int *cpup) | |||
311 | * access the hardware device itself. | 338 | * access the hardware device itself. |
312 | * We just set the mode and remove it from the lists. | 339 | * We just set the mode and remove it from the lists. |
313 | */ | 340 | */ |
314 | static void tick_shutdown(unsigned int *cpup) | 341 | void tick_shutdown(unsigned int *cpup) |
315 | { | 342 | { |
316 | struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); | 343 | struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); |
317 | struct clock_event_device *dev = td->evtdev; | 344 | struct clock_event_device *dev = td->evtdev; |
318 | unsigned long flags; | ||
319 | 345 | ||
320 | raw_spin_lock_irqsave(&tick_device_lock, flags); | ||
321 | td->mode = TICKDEV_MODE_PERIODIC; | 346 | td->mode = TICKDEV_MODE_PERIODIC; |
322 | if (dev) { | 347 | if (dev) { |
323 | /* | 348 | /* |
@@ -329,26 +354,20 @@ static void tick_shutdown(unsigned int *cpup) | |||
329 | dev->event_handler = clockevents_handle_noop; | 354 | dev->event_handler = clockevents_handle_noop; |
330 | td->evtdev = NULL; | 355 | td->evtdev = NULL; |
331 | } | 356 | } |
332 | raw_spin_unlock_irqrestore(&tick_device_lock, flags); | ||
333 | } | 357 | } |
334 | 358 | ||
335 | static void tick_suspend(void) | 359 | void tick_suspend(void) |
336 | { | 360 | { |
337 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 361 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); |
338 | unsigned long flags; | ||
339 | 362 | ||
340 | raw_spin_lock_irqsave(&tick_device_lock, flags); | ||
341 | clockevents_shutdown(td->evtdev); | 363 | clockevents_shutdown(td->evtdev); |
342 | raw_spin_unlock_irqrestore(&tick_device_lock, flags); | ||
343 | } | 364 | } |
344 | 365 | ||
345 | static void tick_resume(void) | 366 | void tick_resume(void) |
346 | { | 367 | { |
347 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 368 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); |
348 | unsigned long flags; | ||
349 | int broadcast = tick_resume_broadcast(); | 369 | int broadcast = tick_resume_broadcast(); |
350 | 370 | ||
351 | raw_spin_lock_irqsave(&tick_device_lock, flags); | ||
352 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); | 371 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); |
353 | 372 | ||
354 | if (!broadcast) { | 373 | if (!broadcast) { |
@@ -357,68 +376,12 @@ static void tick_resume(void) | |||
357 | else | 376 | else |
358 | tick_resume_oneshot(); | 377 | tick_resume_oneshot(); |
359 | } | 378 | } |
360 | raw_spin_unlock_irqrestore(&tick_device_lock, flags); | ||
361 | } | 379 | } |
362 | 380 | ||
363 | /* | ||
364 | * Notification about clock event devices | ||
365 | */ | ||
366 | static int tick_notify(struct notifier_block *nb, unsigned long reason, | ||
367 | void *dev) | ||
368 | { | ||
369 | switch (reason) { | ||
370 | |||
371 | case CLOCK_EVT_NOTIFY_ADD: | ||
372 | return tick_check_new_device(dev); | ||
373 | |||
374 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | ||
375 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | ||
376 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | ||
377 | tick_broadcast_on_off(reason, dev); | ||
378 | break; | ||
379 | |||
380 | case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: | ||
381 | case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: | ||
382 | tick_broadcast_oneshot_control(reason); | ||
383 | break; | ||
384 | |||
385 | case CLOCK_EVT_NOTIFY_CPU_DYING: | ||
386 | tick_handover_do_timer(dev); | ||
387 | break; | ||
388 | |||
389 | case CLOCK_EVT_NOTIFY_CPU_DEAD: | ||
390 | tick_shutdown_broadcast_oneshot(dev); | ||
391 | tick_shutdown_broadcast(dev); | ||
392 | tick_shutdown(dev); | ||
393 | break; | ||
394 | |||
395 | case CLOCK_EVT_NOTIFY_SUSPEND: | ||
396 | tick_suspend(); | ||
397 | tick_suspend_broadcast(); | ||
398 | break; | ||
399 | |||
400 | case CLOCK_EVT_NOTIFY_RESUME: | ||
401 | tick_resume(); | ||
402 | break; | ||
403 | |||
404 | default: | ||
405 | break; | ||
406 | } | ||
407 | |||
408 | return NOTIFY_OK; | ||
409 | } | ||
410 | |||
411 | static struct notifier_block tick_notifier = { | ||
412 | .notifier_call = tick_notify, | ||
413 | }; | ||
414 | |||
415 | /** | 381 | /** |
416 | * tick_init - initialize the tick control | 382 | * tick_init - initialize the tick control |
417 | * | ||
418 | * Register the notifier with the clockevents framework | ||
419 | */ | 383 | */ |
420 | void __init tick_init(void) | 384 | void __init tick_init(void) |
421 | { | 385 | { |
422 | clockevents_register_notifier(&tick_notifier); | ||
423 | tick_broadcast_init(); | 386 | tick_broadcast_init(); |
424 | } | 387 | } |
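The per-CPU tick device selection above is now expressed as two predicates: tick_check_percpu() rejects devices that cannot serve this CPU (wrong cpumask, non-steerable IRQ, or a global device trying to displace a CPU-local one), and tick_check_preferred() keeps oneshot-capable devices and higher ratings. A simplified standalone sketch of those two checks, using plain fields instead of cpumasks and dropping the cpumask comparison the kernel also applies, is:

#include <stdbool.h>
#include <stdio.h>

#define FEAT_ONESHOT 0x1

struct evtdev {
	const char *name;
	int rating;
	unsigned int features;
	int cpu;			/* -1: global device, otherwise per-CPU */
	bool irq_can_set_affinity;
};

static bool check_percpu(const struct evtdev *cur, const struct evtdev *new,
			 int cpu)
{
	if (new->cpu == cpu)
		return true;			/* already CPU local */
	if (new->cpu != -1)
		return false;			/* bound to another CPU */
	if (!new->irq_can_set_affinity)
		return false;			/* cannot be steered here */
	/* do not replace an existing CPU-local device by a global one */
	return !(cur && cur->cpu == cpu);
}

static bool check_preferred(const struct evtdev *cur, const struct evtdev *new)
{
	if (!(new->features & FEAT_ONESHOT) &&
	    cur && (cur->features & FEAT_ONESHOT))
		return false;			/* never regress from oneshot */
	return !cur || new->rating > cur->rating;
}

int main(void)
{
	struct evtdev lapic = { "lapic", 100, FEAT_ONESHOT, 0, false };
	struct evtdev hpet  = { "hpet",  250, FEAT_ONESHOT, -1, true };

	/* hpet is better rated, but lapic stays because it is CPU local */
	printf("%d\n", check_percpu(&lapic, &hpet, 0));		/* 0 */
	/* with no local device yet, the steerable global hpet is accepted */
	printf("%d\n", check_percpu(NULL, &hpet, 0) &&
		       check_preferred(NULL, &hpet));		/* 1 */
	return 0;
}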
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f0299eae4602..bc906cad709b 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -6,6 +6,8 @@ | |||
6 | 6 | ||
7 | extern seqlock_t jiffies_lock; | 7 | extern seqlock_t jiffies_lock; |
8 | 8 | ||
9 | #define CS_NAME_LEN 32 | ||
10 | |||
9 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD | 11 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD |
10 | 12 | ||
11 | #define TICK_DO_TIMER_NONE -1 | 13 | #define TICK_DO_TIMER_NONE -1 |
@@ -18,9 +20,19 @@ extern int tick_do_timer_cpu __read_mostly; | |||
18 | 20 | ||
19 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); | 21 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); |
20 | extern void tick_handle_periodic(struct clock_event_device *dev); | 22 | extern void tick_handle_periodic(struct clock_event_device *dev); |
23 | extern void tick_check_new_device(struct clock_event_device *dev); | ||
24 | extern void tick_handover_do_timer(int *cpup); | ||
25 | extern void tick_shutdown(unsigned int *cpup); | ||
26 | extern void tick_suspend(void); | ||
27 | extern void tick_resume(void); | ||
28 | extern bool tick_check_replacement(struct clock_event_device *curdev, | ||
29 | struct clock_event_device *newdev); | ||
30 | extern void tick_install_replacement(struct clock_event_device *dev); | ||
21 | 31 | ||
22 | extern void clockevents_shutdown(struct clock_event_device *dev); | 32 | extern void clockevents_shutdown(struct clock_event_device *dev); |
23 | 33 | ||
34 | extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); | ||
35 | |||
24 | /* | 36 | /* |
25 | * NO_HZ / high resolution timer shared code | 37 | * NO_HZ / high resolution timer shared code |
26 | */ | 38 | */ |
@@ -90,7 +102,7 @@ static inline bool tick_broadcast_oneshot_available(void) { return false; } | |||
90 | */ | 102 | */ |
91 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | 103 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST |
92 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); | 104 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); |
93 | extern int tick_check_broadcast_device(struct clock_event_device *dev); | 105 | extern void tick_install_broadcast_device(struct clock_event_device *dev); |
94 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | 106 | extern int tick_is_broadcast_device(struct clock_event_device *dev); |
95 | extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); | 107 | extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); |
96 | extern void tick_shutdown_broadcast(unsigned int *cpup); | 108 | extern void tick_shutdown_broadcast(unsigned int *cpup); |
@@ -102,9 +114,8 @@ tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); | |||
102 | 114 | ||
103 | #else /* !BROADCAST */ | 115 | #else /* !BROADCAST */ |
104 | 116 | ||
105 | static inline int tick_check_broadcast_device(struct clock_event_device *dev) | 117 | static inline void tick_install_broadcast_device(struct clock_event_device *dev) |
106 | { | 118 | { |
107 | return 0; | ||
108 | } | 119 | } |
109 | 120 | ||
110 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) | 121 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index baeeb5c87cf1..48b9fffabdc2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -25,6 +25,11 @@ | |||
25 | 25 | ||
26 | #include "tick-internal.h" | 26 | #include "tick-internal.h" |
27 | #include "ntp_internal.h" | 27 | #include "ntp_internal.h" |
28 | #include "timekeeping_internal.h" | ||
29 | |||
30 | #define TK_CLEAR_NTP (1 << 0) | ||
31 | #define TK_MIRROR (1 << 1) | ||
32 | #define TK_CLOCK_WAS_SET (1 << 2) | ||
28 | 33 | ||
29 | static struct timekeeper timekeeper; | 34 | static struct timekeeper timekeeper; |
30 | static DEFINE_RAW_SPINLOCK(timekeeper_lock); | 35 | static DEFINE_RAW_SPINLOCK(timekeeper_lock); |
@@ -200,9 +205,9 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | |||
200 | 205 | ||
201 | static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); | 206 | static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); |
202 | 207 | ||
203 | static void update_pvclock_gtod(struct timekeeper *tk) | 208 | static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) |
204 | { | 209 | { |
205 | raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); | 210 | raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk); |
206 | } | 211 | } |
207 | 212 | ||
208 | /** | 213 | /** |
@@ -216,7 +221,7 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) | |||
216 | 221 | ||
217 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 222 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
218 | ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); | 223 | ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); |
219 | update_pvclock_gtod(tk); | 224 | update_pvclock_gtod(tk, true); |
220 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 225 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
221 | 226 | ||
222 | return ret; | 227 | return ret; |
@@ -241,16 +246,16 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) | |||
241 | EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); | 246 | EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); |
242 | 247 | ||
243 | /* must hold timekeeper_lock */ | 248 | /* must hold timekeeper_lock */ |
244 | static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) | 249 | static void timekeeping_update(struct timekeeper *tk, unsigned int action) |
245 | { | 250 | { |
246 | if (clearntp) { | 251 | if (action & TK_CLEAR_NTP) { |
247 | tk->ntp_error = 0; | 252 | tk->ntp_error = 0; |
248 | ntp_clear(); | 253 | ntp_clear(); |
249 | } | 254 | } |
250 | update_vsyscall(tk); | 255 | update_vsyscall(tk); |
251 | update_pvclock_gtod(tk); | 256 | update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); |
252 | 257 | ||
253 | if (mirror) | 258 | if (action & TK_MIRROR) |
254 | memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); | 259 | memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); |
255 | } | 260 | } |
256 | 261 | ||
@@ -508,7 +513,7 @@ int do_settimeofday(const struct timespec *tv) | |||
508 | 513 | ||
509 | tk_set_xtime(tk, tv); | 514 | tk_set_xtime(tk, tv); |
510 | 515 | ||
511 | timekeeping_update(tk, true, true); | 516 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); |
512 | 517 | ||
513 | write_seqcount_end(&timekeeper_seq); | 518 | write_seqcount_end(&timekeeper_seq); |
514 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 519 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
@@ -552,7 +557,7 @@ int timekeeping_inject_offset(struct timespec *ts) | |||
552 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); | 557 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); |
553 | 558 | ||
554 | error: /* even if we error out, we forwarded the time, so call update */ | 559 | error: /* even if we error out, we forwarded the time, so call update */ |
555 | timekeeping_update(tk, true, true); | 560 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); |
556 | 561 | ||
557 | write_seqcount_end(&timekeeper_seq); | 562 | write_seqcount_end(&timekeeper_seq); |
558 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 563 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
@@ -627,13 +632,22 @@ static int change_clocksource(void *data) | |||
627 | write_seqcount_begin(&timekeeper_seq); | 632 | write_seqcount_begin(&timekeeper_seq); |
628 | 633 | ||
629 | timekeeping_forward_now(tk); | 634 | timekeeping_forward_now(tk); |
630 | if (!new->enable || new->enable(new) == 0) { | 635 | /* |
631 | old = tk->clock; | 636 | * If the cs is in module, get a module reference. Succeeds |
632 | tk_setup_internals(tk, new); | 637 | * for built-in code (owner == NULL) as well. |
633 | if (old->disable) | 638 | */ |
634 | old->disable(old); | 639 | if (try_module_get(new->owner)) { |
640 | if (!new->enable || new->enable(new) == 0) { | ||
641 | old = tk->clock; | ||
642 | tk_setup_internals(tk, new); | ||
643 | if (old->disable) | ||
644 | old->disable(old); | ||
645 | module_put(old->owner); | ||
646 | } else { | ||
647 | module_put(new->owner); | ||
648 | } | ||
635 | } | 649 | } |
636 | timekeeping_update(tk, true, true); | 650 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); |
637 | 651 | ||
638 | write_seqcount_end(&timekeeper_seq); | 652 | write_seqcount_end(&timekeeper_seq); |
639 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 653 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
@@ -648,14 +662,15 @@ static int change_clocksource(void *data) | |||
648 | * This function is called from clocksource.c after a new, better clock | 662 | * This function is called from clocksource.c after a new, better clock |
649 | * source has been registered. The caller holds the clocksource_mutex. | 663 | * source has been registered. The caller holds the clocksource_mutex. |
650 | */ | 664 | */ |
651 | void timekeeping_notify(struct clocksource *clock) | 665 | int timekeeping_notify(struct clocksource *clock) |
652 | { | 666 | { |
653 | struct timekeeper *tk = &timekeeper; | 667 | struct timekeeper *tk = &timekeeper; |
654 | 668 | ||
655 | if (tk->clock == clock) | 669 | if (tk->clock == clock) |
656 | return; | 670 | return 0; |
657 | stop_machine(change_clocksource, clock, NULL); | 671 | stop_machine(change_clocksource, clock, NULL); |
658 | tick_clock_notify(); | 672 | tick_clock_notify(); |
673 | return tk->clock == clock ? 0 : -1; | ||
659 | } | 674 | } |
660 | 675 | ||
661 | /** | 676 | /** |
@@ -841,6 +856,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | |||
841 | tk_xtime_add(tk, delta); | 856 | tk_xtime_add(tk, delta); |
842 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); | 857 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); |
843 | tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); | 858 | tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); |
859 | tk_debug_account_sleep_time(delta); | ||
844 | } | 860 | } |
845 | 861 | ||
846 | /** | 862 | /** |
@@ -872,7 +888,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) | |||
872 | 888 | ||
873 | __timekeeping_inject_sleeptime(tk, delta); | 889 | __timekeeping_inject_sleeptime(tk, delta); |
874 | 890 | ||
875 | timekeeping_update(tk, true, true); | 891 | timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); |
876 | 892 | ||
877 | write_seqcount_end(&timekeeper_seq); | 893 | write_seqcount_end(&timekeeper_seq); |
878 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 894 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
@@ -954,7 +970,7 @@ static void timekeeping_resume(void) | |||
954 | tk->cycle_last = clock->cycle_last = cycle_now; | 970 | tk->cycle_last = clock->cycle_last = cycle_now; |
955 | tk->ntp_error = 0; | 971 | tk->ntp_error = 0; |
956 | timekeeping_suspended = 0; | 972 | timekeeping_suspended = 0; |
957 | timekeeping_update(tk, false, true); | 973 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); |
958 | write_seqcount_end(&timekeeper_seq); | 974 | write_seqcount_end(&timekeeper_seq); |
959 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 975 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
960 | 976 | ||
@@ -1236,9 +1252,10 @@ out_adjust: | |||
1236 | * It also calls into the NTP code to handle leapsecond processing. | 1252 | * It also calls into the NTP code to handle leapsecond processing. |
1237 | * | 1253 | * |
1238 | */ | 1254 | */ |
1239 | static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) | 1255 | static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) |
1240 | { | 1256 | { |
1241 | u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; | 1257 | u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; |
1258 | unsigned int action = 0; | ||
1242 | 1259 | ||
1243 | while (tk->xtime_nsec >= nsecps) { | 1260 | while (tk->xtime_nsec >= nsecps) { |
1244 | int leap; | 1261 | int leap; |
@@ -1261,8 +1278,10 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) | |||
1261 | __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); | 1278 | __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); |
1262 | 1279 | ||
1263 | clock_was_set_delayed(); | 1280 | clock_was_set_delayed(); |
1281 | action = TK_CLOCK_WAS_SET; | ||
1264 | } | 1282 | } |
1265 | } | 1283 | } |
1284 | return action; | ||
1266 | } | 1285 | } |
1267 | 1286 | ||
1268 | /** | 1287 | /** |
@@ -1347,6 +1366,7 @@ static void update_wall_time(void) | |||
1347 | struct timekeeper *tk = &shadow_timekeeper; | 1366 | struct timekeeper *tk = &shadow_timekeeper; |
1348 | cycle_t offset; | 1367 | cycle_t offset; |
1349 | int shift = 0, maxshift; | 1368 | int shift = 0, maxshift; |
1369 | unsigned int action; | ||
1350 | unsigned long flags; | 1370 | unsigned long flags; |
1351 | 1371 | ||
1352 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1372 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
@@ -1399,7 +1419,7 @@ static void update_wall_time(void) | |||
1399 | * Finally, make sure that after the rounding | 1419 | * Finally, make sure that after the rounding |
1400 | * xtime_nsec isn't larger than NSEC_PER_SEC | 1420 | * xtime_nsec isn't larger than NSEC_PER_SEC |
1401 | */ | 1421 | */ |
1402 | accumulate_nsecs_to_secs(tk); | 1422 | action = accumulate_nsecs_to_secs(tk); |
1403 | 1423 | ||
1404 | write_seqcount_begin(&timekeeper_seq); | 1424 | write_seqcount_begin(&timekeeper_seq); |
1405 | /* Update clock->cycle_last with the new value */ | 1425 | /* Update clock->cycle_last with the new value */ |
@@ -1415,7 +1435,7 @@ static void update_wall_time(void) | |||
1415 | * updating. | 1435 | * updating. |
1416 | */ | 1436 | */ |
1417 | memcpy(real_tk, tk, sizeof(*tk)); | 1437 | memcpy(real_tk, tk, sizeof(*tk)); |
1418 | timekeeping_update(real_tk, false, false); | 1438 | timekeeping_update(real_tk, action); |
1419 | write_seqcount_end(&timekeeper_seq); | 1439 | write_seqcount_end(&timekeeper_seq); |
1420 | out: | 1440 | out: |
1421 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1441 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
@@ -1677,6 +1697,7 @@ int do_adjtimex(struct timex *txc) | |||
1677 | 1697 | ||
1678 | if (tai != orig_tai) { | 1698 | if (tai != orig_tai) { |
1679 | __timekeeping_set_tai_offset(tk, tai); | 1699 | __timekeeping_set_tai_offset(tk, tai); |
1700 | update_pvclock_gtod(tk, true); | ||
1680 | clock_was_set_delayed(); | 1701 | clock_was_set_delayed(); |
1681 | } | 1702 | } |
1682 | write_seqcount_end(&timekeeper_seq); | 1703 | write_seqcount_end(&timekeeper_seq); |
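The timekeeping.c changes above replace timekeeping_update()'s (clearntp, mirror) bool pair with a single action bitmask so callers can additionally pass TK_CLOCK_WAS_SET down to the pvclock notifier chain. A userspace sketch of that flag-passing pattern, with stand-in update hooks rather than the real vsyscall/pvclock machinery, is:

#include <stdio.h>

#define TK_CLEAR_NTP      (1 << 0)
#define TK_MIRROR         (1 << 1)
#define TK_CLOCK_WAS_SET  (1 << 2)

struct timekeeper { long ntp_error; };

static struct timekeeper timekeeper, shadow_timekeeper;

/* stand-in for the pvclock notifier chain */
static void update_pvclock_gtod(struct timekeeper *tk, int was_set)
{
	printf("notify listeners, clock_was_set=%d\n", was_set);
}

/* same shape as timekeeping_update(): one bitmask instead of two bools */
static void timekeeping_update(struct timekeeper *tk, unsigned int action)
{
	if (action & TK_CLEAR_NTP)
		tk->ntp_error = 0;
	update_pvclock_gtod(tk, !!(action & TK_CLOCK_WAS_SET));
	if (action & TK_MIRROR)
		shadow_timekeeper = *tk;
}

int main(void)
{
	/* settimeofday-style caller: reset NTP, mirror, and flag the set */
	timekeeping_update(&timekeeper,
			   TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
	/* periodic update_wall_time()-style caller: no flags at all */
	timekeeping_update(&timekeeper, 0);
	return 0;
}

The bitmask keeps the call sites readable and leaves room for further flags without another signature change.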
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c new file mode 100644 index 000000000000..802433a4f5eb --- /dev/null +++ b/kernel/time/timekeeping_debug.c | |||
@@ -0,0 +1,72 @@ | |||
1 | /* | ||
2 | * debugfs file to track time spent in suspend | ||
3 | * | ||
4 | * Copyright (c) 2011, Google, Inc. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
14 | * more details. | ||
15 | */ | ||
16 | |||
17 | #include <linux/debugfs.h> | ||
18 | #include <linux/err.h> | ||
19 | #include <linux/init.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/seq_file.h> | ||
22 | #include <linux/time.h> | ||
23 | |||
24 | static unsigned int sleep_time_bin[32] = {0}; | ||
25 | |||
26 | static int tk_debug_show_sleep_time(struct seq_file *s, void *data) | ||
27 | { | ||
28 | unsigned int bin; | ||
29 | seq_puts(s, " time (secs) count\n"); | ||
30 | seq_puts(s, "------------------------------\n"); | ||
31 | for (bin = 0; bin < 32; bin++) { | ||
32 | if (sleep_time_bin[bin] == 0) | ||
33 | continue; | ||
34 | seq_printf(s, "%10u - %-10u %4u\n", | ||
35 | bin ? 1 << (bin - 1) : 0, 1 << bin, | ||
36 | sleep_time_bin[bin]); | ||
37 | } | ||
38 | return 0; | ||
39 | } | ||
40 | |||
41 | static int tk_debug_sleep_time_open(struct inode *inode, struct file *file) | ||
42 | { | ||
43 | return single_open(file, tk_debug_show_sleep_time, NULL); | ||
44 | } | ||
45 | |||
46 | static const struct file_operations tk_debug_sleep_time_fops = { | ||
47 | .open = tk_debug_sleep_time_open, | ||
48 | .read = seq_read, | ||
49 | .llseek = seq_lseek, | ||
50 | .release = single_release, | ||
51 | }; | ||
52 | |||
53 | static int __init tk_debug_sleep_time_init(void) | ||
54 | { | ||
55 | struct dentry *d; | ||
56 | |||
57 | d = debugfs_create_file("sleep_time", 0444, NULL, NULL, | ||
58 | &tk_debug_sleep_time_fops); | ||
59 | if (!d) { | ||
60 | pr_err("Failed to create sleep_time debug file\n"); | ||
61 | return -ENOMEM; | ||
62 | } | ||
63 | |||
64 | return 0; | ||
65 | } | ||
66 | late_initcall(tk_debug_sleep_time_init); | ||
67 | |||
68 | void tk_debug_account_sleep_time(struct timespec *t) | ||
69 | { | ||
70 | sleep_time_bin[fls(t->tv_sec)]++; | ||
71 | } | ||
72 | |||
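The new timekeeping_debug.c buckets each suspend duration into sleep_time_bin[fls(t->tv_sec)], i.e. into power-of-two ranges [2^(n-1), 2^n), and dumps the histogram through a debugfs seq_file. A standalone sketch of that binning and table layout, using __builtin_clz() to stand in for the kernel's fls():

#include <stdio.h>

static unsigned int sleep_time_bin[32];

/* fls(): 1-based index of the highest set bit; fls(0) == 0, like the kernel helper */
static int fls(unsigned int x)
{
    return x ? 32 - __builtin_clz(x) : 0;
}

static void account_sleep_time(unsigned int seconds)
{
    sleep_time_bin[fls(seconds)]++;
}

int main(void)
{
    unsigned int samples[] = { 0, 1, 3, 5, 17, 17, 100 };
    unsigned int bin, i;

    for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
        account_sleep_time(samples[i]);

    printf("  time (secs)        count\n");
    printf("------------------------------\n");
    for (bin = 0; bin < 32; bin++) {
        if (!sleep_time_bin[bin])
            continue;
        /* bin n holds durations in [2^(n-1), 2^n); bin 0 holds zero-second sleeps */
        printf("%10u - %-10u %4u\n",
               bin ? 1u << (bin - 1) : 0, 1u << bin, sleep_time_bin[bin]);
    }
    return 0;
}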
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h new file mode 100644 index 000000000000..13323ea08ffa --- /dev/null +++ b/kernel/time/timekeeping_internal.h | |||
@@ -0,0 +1,14 @@ | |||
1 | #ifndef _TIMEKEEPING_INTERNAL_H | ||
2 | #define _TIMEKEEPING_INTERNAL_H | ||
3 | /* | ||
4 | * timekeeping debug functions | ||
5 | */ | ||
6 | #include <linux/time.h> | ||
7 | |||
8 | #ifdef CONFIG_DEBUG_FS | ||
9 | extern void tk_debug_account_sleep_time(struct timespec *t); | ||
10 | #else | ||
11 | #define tk_debug_account_sleep_time(x) | ||
12 | #endif | ||
13 | |||
14 | #endif /* _TIMEKEEPING_INTERNAL_H */ | ||
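timekeeping_internal.h uses the usual config-gated stub: the debug hook is a real function when CONFIG_DEBUG_FS is on and a do-nothing macro otherwise, so callers never need their own #ifdefs. A minimal sketch of the same pattern with a made-up MYDRV_DEBUG switch (not a real kernel config symbol):

/* mydrv_debug.h -- MYDRV_DEBUG is a hypothetical compile-time switch */
#ifdef MYDRV_DEBUG
extern void mydrv_account_sleep(int seconds);   /* real version lives in mydrv_debug.c */
#else
#define mydrv_account_sleep(x)                  /* call sites compile away entirely */
#endif

One caveat of the empty-macro form used here: the argument expression is dropped too, so it must be side-effect free; a static inline stub avoids that at no extra cost.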
diff --git a/kernel/timer.c b/kernel/timer.c index 15ffdb3f1948..15bc1b41021d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -149,9 +149,11 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu, | |||
149 | /* now that we have rounded, subtract the extra skew again */ | 149 | /* now that we have rounded, subtract the extra skew again */ |
150 | j -= cpu * 3; | 150 | j -= cpu * 3; |
151 | 151 | ||
152 | if (j <= jiffies) /* rounding ate our timeout entirely; */ | 152 | /* |
153 | return original; | 153 | * Make sure j is still in the future. Otherwise return the |
154 | return j; | 154 | * unmodified value. |
155 | */ | ||
156 | return time_is_after_jiffies(j) ? j : original; | ||
155 | } | 157 | } |
156 | 158 | ||
157 | /** | 159 | /** |
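The timer.c hunk swaps the open-coded "j <= jiffies" test for time_is_after_jiffies(j), which stays correct when the jiffies counter wraps because it compares via a signed difference of the unsigned values. A self-contained illustration of that comparison trick, assuming 32-bit counters:

#include <stdio.h>

/* time_after(a, b): true if a is later than b, even across wraparound */
static int time_after(unsigned int a, unsigned int b)
{
    return (int)(b - a) < 0;
}

int main(void)
{
    unsigned int jiffies = 0xfffffff0u;     /* counter close to wrapping   */
    unsigned int timeout = jiffies + 0x20;  /* wraps around to 0x00000010  */

    /* naive "timeout <= jiffies" claims the timeout already expired ...   */
    printf("naive compare says expired: %d\n", timeout <= jiffies);
    /* ... the signed-difference form still sees it as in the future.      */
    printf("time_after(timeout, jiffies): %d\n", time_after(timeout, jiffies));
    return 0;
}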
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6c508ff33c62..67708f46baae 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -413,6 +413,17 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
413 | return 0; | 413 | return 0; |
414 | } | 414 | } |
415 | 415 | ||
416 | static void ftrace_sync(struct work_struct *work) | ||
417 | { | ||
418 | /* | ||
419 | * This function is just a stub to implement a hard force | ||
420 | * of synchronize_sched(). This requires synchronizing | ||
421 | * tasks even in userspace and idle. | ||
422 | * | ||
423 | * Yes, function tracing is rude. | ||
424 | */ | ||
425 | } | ||
426 | |||
416 | static int __unregister_ftrace_function(struct ftrace_ops *ops) | 427 | static int __unregister_ftrace_function(struct ftrace_ops *ops) |
417 | { | 428 | { |
418 | int ret; | 429 | int ret; |
@@ -440,8 +451,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
440 | * so there'll be no new users. We must ensure | 451 | * so there'll be no new users. We must ensure |
441 | * all current users are done before we free | 452 | * all current users are done before we free |
442 | * the control data. | 453 | * the control data. |
454 | * Note synchronize_sched() is not enough, as we | ||
455 | * use preempt_disable() to do RCU, but the function | ||
456 | * tracer can be called where RCU is not active | ||
457 | * (before user_exit()). | ||
443 | */ | 458 | */ |
444 | synchronize_sched(); | 459 | schedule_on_each_cpu(ftrace_sync); |
445 | control_ops_free(ops); | 460 | control_ops_free(ops); |
446 | } | 461 | } |
447 | } else | 462 | } else |
@@ -456,9 +471,13 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
456 | /* | 471 | /* |
457 | * Dynamic ops may be freed, we must make sure that all | 472 | * Dynamic ops may be freed, we must make sure that all |
458 | * callers are done before leaving this function. | 473 | * callers are done before leaving this function. |
474 | * | ||
475 | * Again, normal synchronize_sched() is not good enough. | ||
476 | * We need to do a hard force of sched synchronization. | ||
459 | */ | 477 | */ |
460 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC) | 478 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC) |
461 | synchronize_sched(); | 479 | schedule_on_each_cpu(ftrace_sync); |
480 | |||
462 | 481 | ||
463 | return 0; | 482 | return 0; |
464 | } | 483 | } |
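The first three ftrace.c hunks replace synchronize_sched() with schedule_on_each_cpu(ftrace_sync): run an empty work item on every CPU and wait, so that once each CPU has scheduled that work, no CPU can still be inside a function-trace callback that started before the switch, even in contexts RCU is not watching. What follows is only a userspace analogy of the "run a no-op on every executor and wait" idea, built on pthreads; it does not use or model the kernel workqueue API.

#include <pthread.h>
#include <stdio.h>

#define NR_WORKERS 4                 /* stand-in for the set of CPUs */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int pending[NR_WORKERS];      /* per-worker "run the no-op job" flag */
static int stop;

static void *worker(void *arg)
{
    int id = *(int *)arg;

    pthread_mutex_lock(&lock);
    while (!stop) {
        if (pending[id]) {
            pending[id] = 0;         /* the no-op job itself */
            pthread_cond_broadcast(&cond);
        }
        pthread_cond_wait(&cond, &lock);
    }
    pthread_mutex_unlock(&lock);
    return NULL;
}

/* Wait until every worker has executed the no-op at least once. */
static void sync_all_workers(void)
{
    int i, busy;

    pthread_mutex_lock(&lock);
    for (i = 0; i < NR_WORKERS; i++)
        pending[i] = 1;
    pthread_cond_broadcast(&cond);
    do {
        busy = 0;
        for (i = 0; i < NR_WORKERS; i++)
            busy |= pending[i];
        if (busy)
            pthread_cond_wait(&cond, &lock);
    } while (busy);
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    pthread_t tid[NR_WORKERS];
    int ids[NR_WORKERS], i;

    for (i = 0; i < NR_WORKERS; i++) {
        ids[i] = i;
        pthread_create(&tid[i], NULL, worker, &ids[i]);
    }
    sync_all_workers();
    printf("all workers passed a quiescent point\n");

    pthread_mutex_lock(&lock);
    stop = 1;
    pthread_cond_broadcast(&cond);
    pthread_mutex_unlock(&lock);
    for (i = 0; i < NR_WORKERS; i++)
        pthread_join(tid[i], NULL);
    return 0;
}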
@@ -622,12 +641,18 @@ static int function_stat_show(struct seq_file *m, void *v) | |||
622 | if (rec->counter <= 1) | 641 | if (rec->counter <= 1) |
623 | stddev = 0; | 642 | stddev = 0; |
624 | else { | 643 | else { |
625 | stddev = rec->time_squared - rec->counter * avg * avg; | 644 | /* |
645 | * Apply Welford's method: | ||
646 | * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) | ||
647 | */ | ||
648 | stddev = rec->counter * rec->time_squared - | ||
649 | rec->time * rec->time; | ||
650 | |||
626 | /* | 651 | /* |
627 | * Divide only 1000 for ns^2 -> us^2 conversion. | 652 | * Divide only 1000 for ns^2 -> us^2 conversion. |
628 | * trace_print_graph_duration will divide 1000 again. | 653 | * trace_print_graph_duration will divide 1000 again. |
629 | */ | 654 | */ |
630 | do_div(stddev, (rec->counter - 1) * 1000); | 655 | do_div(stddev, rec->counter * (rec->counter - 1) * 1000); |
631 | } | 656 | } |
632 | 657 | ||
633 | trace_seq_init(&s); | 658 | trace_seq_init(&s); |
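The function_stat_show() hunk recomputes the sample variance directly from the counters the profiler already keeps (rec->time and rec->time_squared) as s^2 = (n*Σx_i² − (Σx_i)²) / (n*(n−1)), and divides by the correct n*(n−1)*1000 factor. A quick standalone check of that identity against the two-pass definition, using doubles rather than the kernel's fixed-point do_div() arithmetic:

#include <stdio.h>

int main(void)
{
    const double x[] = { 4.0, 7.0, 13.0, 16.0 };
    const int n = sizeof(x) / sizeof(x[0]);
    double sum = 0.0, sum_sq = 0.0, mean, two_pass = 0.0, one_pass;
    int i;

    for (i = 0; i < n; i++) {
        sum += x[i];
        sum_sq += x[i] * x[i];          /* the profiler keeps time and time_squared */
    }
    mean = sum / n;

    for (i = 0; i < n; i++)             /* textbook two-pass sample variance */
        two_pass += (x[i] - mean) * (x[i] - mean);
    two_pass /= n - 1;

    /* single-pass form used in the patch: (n*Σx² - (Σx)²) / (n*(n-1)) */
    one_pass = (n * sum_sq - sum * sum) / (n * (double)(n - 1));

    printf("two-pass: %f  one-pass: %f\n", two_pass, one_pass);
    return 0;
}

Both prints give 30 for this data set. Note that despite the comment's mention of Welford, the formula is the classic one-pass sum-of-squares identity rather than Welford's incremental update; it fits here because it reuses the sums ftrace already maintains.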
@@ -3512,8 +3537,12 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); | |||
3512 | static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; | 3537 | static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; |
3513 | static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; | 3538 | static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; |
3514 | 3539 | ||
3540 | /* Used by function selftest to not test if filter is set */ | ||
3541 | bool ftrace_filter_param __initdata; | ||
3542 | |||
3515 | static int __init set_ftrace_notrace(char *str) | 3543 | static int __init set_ftrace_notrace(char *str) |
3516 | { | 3544 | { |
3545 | ftrace_filter_param = true; | ||
3517 | strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); | 3546 | strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); |
3518 | return 1; | 3547 | return 1; |
3519 | } | 3548 | } |
@@ -3521,6 +3550,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace); | |||
3521 | 3550 | ||
3522 | static int __init set_ftrace_filter(char *str) | 3551 | static int __init set_ftrace_filter(char *str) |
3523 | { | 3552 | { |
3553 | ftrace_filter_param = true; | ||
3524 | strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); | 3554 | strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); |
3525 | return 1; | 3555 | return 1; |
3526 | } | 3556 | } |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e71a8be4a6ee..0cd500bffd9b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -115,6 +115,9 @@ cpumask_var_t __read_mostly tracing_buffer_mask; | |||
115 | 115 | ||
116 | enum ftrace_dump_mode ftrace_dump_on_oops; | 116 | enum ftrace_dump_mode ftrace_dump_on_oops; |
117 | 117 | ||
118 | /* When set, tracing will stop when a WARN*() is hit */ | ||
119 | int __disable_trace_on_warning; | ||
120 | |||
118 | static int tracing_set_tracer(const char *buf); | 121 | static int tracing_set_tracer(const char *buf); |
119 | 122 | ||
120 | #define MAX_TRACER_SIZE 100 | 123 | #define MAX_TRACER_SIZE 100 |
@@ -149,6 +152,13 @@ static int __init set_ftrace_dump_on_oops(char *str) | |||
149 | } | 152 | } |
150 | __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); | 153 | __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); |
151 | 154 | ||
155 | static int __init stop_trace_on_warning(char *str) | ||
156 | { | ||
157 | __disable_trace_on_warning = 1; | ||
158 | return 1; | ||
159 | } | ||
160 | __setup("traceoff_on_warning=", stop_trace_on_warning); | ||
161 | |||
152 | static int __init boot_alloc_snapshot(char *str) | 162 | static int __init boot_alloc_snapshot(char *str) |
153 | { | 163 | { |
154 | allocate_snapshot = true; | 164 | allocate_snapshot = true; |
@@ -170,6 +180,7 @@ static int __init set_trace_boot_options(char *str) | |||
170 | } | 180 | } |
171 | __setup("trace_options=", set_trace_boot_options); | 181 | __setup("trace_options=", set_trace_boot_options); |
172 | 182 | ||
183 | |||
173 | unsigned long long ns2usecs(cycle_t nsec) | 184 | unsigned long long ns2usecs(cycle_t nsec) |
174 | { | 185 | { |
175 | nsec += 500; | 186 | nsec += 500; |
@@ -193,6 +204,37 @@ static struct trace_array global_trace; | |||
193 | 204 | ||
194 | LIST_HEAD(ftrace_trace_arrays); | 205 | LIST_HEAD(ftrace_trace_arrays); |
195 | 206 | ||
207 | int trace_array_get(struct trace_array *this_tr) | ||
208 | { | ||
209 | struct trace_array *tr; | ||
210 | int ret = -ENODEV; | ||
211 | |||
212 | mutex_lock(&trace_types_lock); | ||
213 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { | ||
214 | if (tr == this_tr) { | ||
215 | tr->ref++; | ||
216 | ret = 0; | ||
217 | break; | ||
218 | } | ||
219 | } | ||
220 | mutex_unlock(&trace_types_lock); | ||
221 | |||
222 | return ret; | ||
223 | } | ||
224 | |||
225 | static void __trace_array_put(struct trace_array *this_tr) | ||
226 | { | ||
227 | WARN_ON(!this_tr->ref); | ||
228 | this_tr->ref--; | ||
229 | } | ||
230 | |||
231 | void trace_array_put(struct trace_array *this_tr) | ||
232 | { | ||
233 | mutex_lock(&trace_types_lock); | ||
234 | __trace_array_put(this_tr); | ||
235 | mutex_unlock(&trace_types_lock); | ||
236 | } | ||
237 | |||
196 | int filter_current_check_discard(struct ring_buffer *buffer, | 238 | int filter_current_check_discard(struct ring_buffer *buffer, |
197 | struct ftrace_event_call *call, void *rec, | 239 | struct ftrace_event_call *call, void *rec, |
198 | struct ring_buffer_event *event) | 240 | struct ring_buffer_event *event) |
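trace_array_get() only takes a reference after finding the array on ftrace_trace_arrays under trace_types_lock, so a pointer to an already-removed instance can never be revived; trace_array_put() drops the count under the same lock. A userspace sketch of that look-up-then-get pattern with a pthread mutex and simplified stand-in types:

#include <pthread.h>
#include <stdio.h>

struct tr_node {
    struct tr_node *next;
    int ref;
};

static pthread_mutex_t types_lock = PTHREAD_MUTEX_INITIALIZER;
static struct tr_node *tr_list;           /* head of the registered arrays */

/* Take a reference only if the object is still on the list. */
static int tr_get(struct tr_node *this_tr)
{
    struct tr_node *tr;
    int ret = -1;                         /* -ENODEV in the kernel version */

    pthread_mutex_lock(&types_lock);
    for (tr = tr_list; tr; tr = tr->next) {
        if (tr == this_tr) {
            tr->ref++;
            ret = 0;
            break;
        }
    }
    pthread_mutex_unlock(&types_lock);
    return ret;
}

static void tr_put(struct tr_node *this_tr)
{
    pthread_mutex_lock(&types_lock);
    this_tr->ref--;                       /* WARN_ON(!ref) in the kernel */
    pthread_mutex_unlock(&types_lock);
}

int main(void)
{
    static struct tr_node a;

    tr_list = &a;                         /* register one array */
    if (tr_get(&a) == 0) {
        printf("got reference, ref=%d\n", a.ref);
        tr_put(&a);
    }
    return 0;
}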
@@ -215,9 +257,24 @@ cycle_t ftrace_now(int cpu) | |||
215 | return ts; | 257 | return ts; |
216 | } | 258 | } |
217 | 259 | ||
260 | /** | ||
261 | * tracing_is_enabled - Show if global_trace has been disabled | ||
262 | * | ||
263 | * Shows if the global trace has been enabled or not. It uses the | ||
264 | * mirror flag "buffer_disabled" to be used in fast paths such as for | ||
265 | * the irqsoff tracer. But it may be inaccurate due to races. If you | ||
266 | * need to know the accurate state, use tracing_is_on() which is a little | ||
267 | * slower, but accurate. | ||
268 | */ | ||
218 | int tracing_is_enabled(void) | 269 | int tracing_is_enabled(void) |
219 | { | 270 | { |
220 | return tracing_is_on(); | 271 | /* |
272 | * For quick access (irqsoff uses this in fast path), just | ||
273 | * return the mirror variable of the state of the ring buffer. | ||
274 | * It's a little racy, but we don't really care. | ||
275 | */ | ||
276 | smp_rmb(); | ||
277 | return !global_trace.buffer_disabled; | ||
221 | } | 278 | } |
222 | 279 | ||
223 | /* | 280 | /* |
@@ -240,7 +297,7 @@ static struct tracer *trace_types __read_mostly; | |||
240 | /* | 297 | /* |
241 | * trace_types_lock is used to protect the trace_types list. | 298 | * trace_types_lock is used to protect the trace_types list. |
242 | */ | 299 | */ |
243 | static DEFINE_MUTEX(trace_types_lock); | 300 | DEFINE_MUTEX(trace_types_lock); |
244 | 301 | ||
245 | /* | 302 | /* |
246 | * serialize the access of the ring buffer | 303 | * serialize the access of the ring buffer |
@@ -330,6 +387,23 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | |||
330 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | | 387 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | |
331 | TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; | 388 | TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; |
332 | 389 | ||
390 | static void tracer_tracing_on(struct trace_array *tr) | ||
391 | { | ||
392 | if (tr->trace_buffer.buffer) | ||
393 | ring_buffer_record_on(tr->trace_buffer.buffer); | ||
394 | /* | ||
395 | * This flag is looked at when buffers haven't been allocated | ||
396 | * yet, or by some tracers (like irqsoff), that just want to | ||
397 | * know if the ring buffer has been disabled, but it can handle | ||
398 | * races of where it gets disabled but we still do a record. | ||
399 | * As the check is in the fast path of the tracers, it is more | ||
400 | * important to be fast than accurate. | ||
401 | */ | ||
402 | tr->buffer_disabled = 0; | ||
403 | /* Make the flag seen by readers */ | ||
404 | smp_wmb(); | ||
405 | } | ||
406 | |||
333 | /** | 407 | /** |
334 | * tracing_on - enable tracing buffers | 408 | * tracing_on - enable tracing buffers |
335 | * | 409 | * |
@@ -338,15 +412,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | |||
338 | */ | 412 | */ |
339 | void tracing_on(void) | 413 | void tracing_on(void) |
340 | { | 414 | { |
341 | if (global_trace.trace_buffer.buffer) | 415 | tracer_tracing_on(&global_trace); |
342 | ring_buffer_record_on(global_trace.trace_buffer.buffer); | ||
343 | /* | ||
344 | * This flag is only looked at when buffers haven't been | ||
345 | * allocated yet. We don't really care about the race | ||
346 | * between setting this flag and actually turning | ||
347 | * on the buffer. | ||
348 | */ | ||
349 | global_trace.buffer_disabled = 0; | ||
350 | } | 416 | } |
351 | EXPORT_SYMBOL_GPL(tracing_on); | 417 | EXPORT_SYMBOL_GPL(tracing_on); |
352 | 418 | ||
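tracer_tracing_on()/tracer_tracing_off() keep a buffer_disabled mirror flag next to the real ring-buffer state, publishing it with smp_wmb() and reading it in tracing_is_enabled() after smp_rmb(); the comments are explicit that a small race is acceptable in exchange for a cheap fast-path check. A sketch of the same publish/observe pairing using C11 release/acquire atomics in place of the kernel barrier macros (a stronger but simpler stand-in):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int buffer_disabled = 1;   /* mirror of the ring-buffer state */

static void tracer_on(void)
{
    /* ... enable the real buffer here ... */
    atomic_store_explicit(&buffer_disabled, 0, memory_order_release);
}

static void tracer_off(void)
{
    atomic_store_explicit(&buffer_disabled, 1, memory_order_release);
    /* ... disable the real buffer here ... */
}

/* Fast-path check: may briefly disagree with the real buffer state. */
static int tracing_enabled(void)
{
    return !atomic_load_explicit(&buffer_disabled, memory_order_acquire);
}

int main(void)
{
    tracer_on();
    printf("enabled: %d\n", tracing_enabled());
    tracer_off();
    printf("enabled: %d\n", tracing_enabled());
    return 0;
}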
@@ -540,6 +606,23 @@ void tracing_snapshot_alloc(void) | |||
540 | EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); | 606 | EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); |
541 | #endif /* CONFIG_TRACER_SNAPSHOT */ | 607 | #endif /* CONFIG_TRACER_SNAPSHOT */ |
542 | 608 | ||
609 | static void tracer_tracing_off(struct trace_array *tr) | ||
610 | { | ||
611 | if (tr->trace_buffer.buffer) | ||
612 | ring_buffer_record_off(tr->trace_buffer.buffer); | ||
613 | /* | ||
614 | * This flag is looked at when buffers haven't been allocated | ||
615 | * yet, or by some tracers (like irqsoff), that just want to | ||
616 | * know if the ring buffer has been disabled, but it can handle | ||
617 | * races of where it gets disabled but we still do a record. | ||
618 | * As the check is in the fast path of the tracers, it is more | ||
619 | * important to be fast than accurate. | ||
620 | */ | ||
621 | tr->buffer_disabled = 1; | ||
622 | /* Make the flag seen by readers */ | ||
623 | smp_wmb(); | ||
624 | } | ||
625 | |||
543 | /** | 626 | /** |
544 | * tracing_off - turn off tracing buffers | 627 | * tracing_off - turn off tracing buffers |
545 | * | 628 | * |
@@ -550,26 +633,35 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); | |||
550 | */ | 633 | */ |
551 | void tracing_off(void) | 634 | void tracing_off(void) |
552 | { | 635 | { |
553 | if (global_trace.trace_buffer.buffer) | 636 | tracer_tracing_off(&global_trace); |
554 | ring_buffer_record_off(global_trace.trace_buffer.buffer); | ||
555 | /* | ||
556 | * This flag is only looked at when buffers haven't been | ||
557 | * allocated yet. We don't really care about the race | ||
558 | * between setting this flag and actually turning | ||
559 | * on the buffer. | ||
560 | */ | ||
561 | global_trace.buffer_disabled = 1; | ||
562 | } | 637 | } |
563 | EXPORT_SYMBOL_GPL(tracing_off); | 638 | EXPORT_SYMBOL_GPL(tracing_off); |
564 | 639 | ||
640 | void disable_trace_on_warning(void) | ||
641 | { | ||
642 | if (__disable_trace_on_warning) | ||
643 | tracing_off(); | ||
644 | } | ||
645 | |||
646 | /** | ||
647 | * tracer_tracing_is_on - show real state of ring buffer enabled | ||
648 | * @tr : the trace array to know if ring buffer is enabled | ||
649 | * | ||
650 | * Shows real state of the ring buffer if it is enabled or not. | ||
651 | */ | ||
652 | static int tracer_tracing_is_on(struct trace_array *tr) | ||
653 | { | ||
654 | if (tr->trace_buffer.buffer) | ||
655 | return ring_buffer_record_is_on(tr->trace_buffer.buffer); | ||
656 | return !tr->buffer_disabled; | ||
657 | } | ||
658 | |||
565 | /** | 659 | /** |
566 | * tracing_is_on - show state of ring buffers enabled | 660 | * tracing_is_on - show state of ring buffers enabled |
567 | */ | 661 | */ |
568 | int tracing_is_on(void) | 662 | int tracing_is_on(void) |
569 | { | 663 | { |
570 | if (global_trace.trace_buffer.buffer) | 664 | return tracer_tracing_is_on(&global_trace); |
571 | return ring_buffer_record_is_on(global_trace.trace_buffer.buffer); | ||
572 | return !global_trace.buffer_disabled; | ||
573 | } | 665 | } |
574 | EXPORT_SYMBOL_GPL(tracing_is_on); | 666 | EXPORT_SYMBOL_GPL(tracing_is_on); |
575 | 667 | ||
@@ -1543,15 +1635,6 @@ trace_function(struct trace_array *tr, | |||
1543 | __buffer_unlock_commit(buffer, event); | 1635 | __buffer_unlock_commit(buffer, event); |
1544 | } | 1636 | } |
1545 | 1637 | ||
1546 | void | ||
1547 | ftrace(struct trace_array *tr, struct trace_array_cpu *data, | ||
1548 | unsigned long ip, unsigned long parent_ip, unsigned long flags, | ||
1549 | int pc) | ||
1550 | { | ||
1551 | if (likely(!atomic_read(&data->disabled))) | ||
1552 | trace_function(tr, ip, parent_ip, flags, pc); | ||
1553 | } | ||
1554 | |||
1555 | #ifdef CONFIG_STACKTRACE | 1638 | #ifdef CONFIG_STACKTRACE |
1556 | 1639 | ||
1557 | #define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) | 1640 | #define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) |
@@ -2768,10 +2851,9 @@ static const struct seq_operations tracer_seq_ops = { | |||
2768 | }; | 2851 | }; |
2769 | 2852 | ||
2770 | static struct trace_iterator * | 2853 | static struct trace_iterator * |
2771 | __tracing_open(struct inode *inode, struct file *file, bool snapshot) | 2854 | __tracing_open(struct trace_array *tr, struct trace_cpu *tc, |
2855 | struct inode *inode, struct file *file, bool snapshot) | ||
2772 | { | 2856 | { |
2773 | struct trace_cpu *tc = inode->i_private; | ||
2774 | struct trace_array *tr = tc->tr; | ||
2775 | struct trace_iterator *iter; | 2857 | struct trace_iterator *iter; |
2776 | int cpu; | 2858 | int cpu; |
2777 | 2859 | ||
@@ -2850,8 +2932,6 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) | |||
2850 | tracing_iter_reset(iter, cpu); | 2932 | tracing_iter_reset(iter, cpu); |
2851 | } | 2933 | } |
2852 | 2934 | ||
2853 | tr->ref++; | ||
2854 | |||
2855 | mutex_unlock(&trace_types_lock); | 2935 | mutex_unlock(&trace_types_lock); |
2856 | 2936 | ||
2857 | return iter; | 2937 | return iter; |
@@ -2874,6 +2954,43 @@ int tracing_open_generic(struct inode *inode, struct file *filp) | |||
2874 | return 0; | 2954 | return 0; |
2875 | } | 2955 | } |
2876 | 2956 | ||
2957 | /* | ||
2958 | * Open and update trace_array ref count. | ||
2959 | * Must have the current trace_array passed to it. | ||
2960 | */ | ||
2961 | static int tracing_open_generic_tr(struct inode *inode, struct file *filp) | ||
2962 | { | ||
2963 | struct trace_array *tr = inode->i_private; | ||
2964 | |||
2965 | if (tracing_disabled) | ||
2966 | return -ENODEV; | ||
2967 | |||
2968 | if (trace_array_get(tr) < 0) | ||
2969 | return -ENODEV; | ||
2970 | |||
2971 | filp->private_data = inode->i_private; | ||
2972 | |||
2973 | return 0; | ||
2974 | |||
2975 | } | ||
2976 | |||
2977 | static int tracing_open_generic_tc(struct inode *inode, struct file *filp) | ||
2978 | { | ||
2979 | struct trace_cpu *tc = inode->i_private; | ||
2980 | struct trace_array *tr = tc->tr; | ||
2981 | |||
2982 | if (tracing_disabled) | ||
2983 | return -ENODEV; | ||
2984 | |||
2985 | if (trace_array_get(tr) < 0) | ||
2986 | return -ENODEV; | ||
2987 | |||
2988 | filp->private_data = inode->i_private; | ||
2989 | |||
2990 | return 0; | ||
2991 | |||
2992 | } | ||
2993 | |||
2877 | static int tracing_release(struct inode *inode, struct file *file) | 2994 | static int tracing_release(struct inode *inode, struct file *file) |
2878 | { | 2995 | { |
2879 | struct seq_file *m = file->private_data; | 2996 | struct seq_file *m = file->private_data; |
@@ -2881,17 +2998,20 @@ static int tracing_release(struct inode *inode, struct file *file) | |||
2881 | struct trace_array *tr; | 2998 | struct trace_array *tr; |
2882 | int cpu; | 2999 | int cpu; |
2883 | 3000 | ||
2884 | if (!(file->f_mode & FMODE_READ)) | 3001 | /* Writes do not use seq_file, need to grab tr from inode */ |
3002 | if (!(file->f_mode & FMODE_READ)) { | ||
3003 | struct trace_cpu *tc = inode->i_private; | ||
3004 | |||
3005 | trace_array_put(tc->tr); | ||
2885 | return 0; | 3006 | return 0; |
3007 | } | ||
2886 | 3008 | ||
2887 | iter = m->private; | 3009 | iter = m->private; |
2888 | tr = iter->tr; | 3010 | tr = iter->tr; |
3011 | trace_array_put(tr); | ||
2889 | 3012 | ||
2890 | mutex_lock(&trace_types_lock); | 3013 | mutex_lock(&trace_types_lock); |
2891 | 3014 | ||
2892 | WARN_ON(!tr->ref); | ||
2893 | tr->ref--; | ||
2894 | |||
2895 | for_each_tracing_cpu(cpu) { | 3015 | for_each_tracing_cpu(cpu) { |
2896 | if (iter->buffer_iter[cpu]) | 3016 | if (iter->buffer_iter[cpu]) |
2897 | ring_buffer_read_finish(iter->buffer_iter[cpu]); | 3017 | ring_buffer_read_finish(iter->buffer_iter[cpu]); |
@@ -2910,20 +3030,49 @@ static int tracing_release(struct inode *inode, struct file *file) | |||
2910 | kfree(iter->trace); | 3030 | kfree(iter->trace); |
2911 | kfree(iter->buffer_iter); | 3031 | kfree(iter->buffer_iter); |
2912 | seq_release_private(inode, file); | 3032 | seq_release_private(inode, file); |
3033 | |||
3034 | return 0; | ||
3035 | } | ||
3036 | |||
3037 | static int tracing_release_generic_tr(struct inode *inode, struct file *file) | ||
3038 | { | ||
3039 | struct trace_array *tr = inode->i_private; | ||
3040 | |||
3041 | trace_array_put(tr); | ||
2913 | return 0; | 3042 | return 0; |
2914 | } | 3043 | } |
2915 | 3044 | ||
3045 | static int tracing_release_generic_tc(struct inode *inode, struct file *file) | ||
3046 | { | ||
3047 | struct trace_cpu *tc = inode->i_private; | ||
3048 | struct trace_array *tr = tc->tr; | ||
3049 | |||
3050 | trace_array_put(tr); | ||
3051 | return 0; | ||
3052 | } | ||
3053 | |||
3054 | static int tracing_single_release_tr(struct inode *inode, struct file *file) | ||
3055 | { | ||
3056 | struct trace_array *tr = inode->i_private; | ||
3057 | |||
3058 | trace_array_put(tr); | ||
3059 | |||
3060 | return single_release(inode, file); | ||
3061 | } | ||
3062 | |||
2916 | static int tracing_open(struct inode *inode, struct file *file) | 3063 | static int tracing_open(struct inode *inode, struct file *file) |
2917 | { | 3064 | { |
3065 | struct trace_cpu *tc = inode->i_private; | ||
3066 | struct trace_array *tr = tc->tr; | ||
2918 | struct trace_iterator *iter; | 3067 | struct trace_iterator *iter; |
2919 | int ret = 0; | 3068 | int ret = 0; |
2920 | 3069 | ||
3070 | if (trace_array_get(tr) < 0) | ||
3071 | return -ENODEV; | ||
3072 | |||
2921 | /* If this file was open for write, then erase contents */ | 3073 | /* If this file was open for write, then erase contents */ |
2922 | if ((file->f_mode & FMODE_WRITE) && | 3074 | if ((file->f_mode & FMODE_WRITE) && |
2923 | (file->f_flags & O_TRUNC)) { | 3075 | (file->f_flags & O_TRUNC)) { |
2924 | struct trace_cpu *tc = inode->i_private; | ||
2925 | struct trace_array *tr = tc->tr; | ||
2926 | |||
2927 | if (tc->cpu == RING_BUFFER_ALL_CPUS) | 3076 | if (tc->cpu == RING_BUFFER_ALL_CPUS) |
2928 | tracing_reset_online_cpus(&tr->trace_buffer); | 3077 | tracing_reset_online_cpus(&tr->trace_buffer); |
2929 | else | 3078 | else |
@@ -2931,12 +3080,16 @@ static int tracing_open(struct inode *inode, struct file *file) | |||
2931 | } | 3080 | } |
2932 | 3081 | ||
2933 | if (file->f_mode & FMODE_READ) { | 3082 | if (file->f_mode & FMODE_READ) { |
2934 | iter = __tracing_open(inode, file, false); | 3083 | iter = __tracing_open(tr, tc, inode, file, false); |
2935 | if (IS_ERR(iter)) | 3084 | if (IS_ERR(iter)) |
2936 | ret = PTR_ERR(iter); | 3085 | ret = PTR_ERR(iter); |
2937 | else if (trace_flags & TRACE_ITER_LATENCY_FMT) | 3086 | else if (trace_flags & TRACE_ITER_LATENCY_FMT) |
2938 | iter->iter_flags |= TRACE_FILE_LAT_FMT; | 3087 | iter->iter_flags |= TRACE_FILE_LAT_FMT; |
2939 | } | 3088 | } |
3089 | |||
3090 | if (ret < 0) | ||
3091 | trace_array_put(tr); | ||
3092 | |||
2940 | return ret; | 3093 | return ret; |
2941 | } | 3094 | } |
2942 | 3095 | ||
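From here on the patch converts one file_operations after another to *_tr/*_tc open and release helpers: ->open takes a trace_array reference (and drops it again on any failure before the open completes), and ->release pairs it with a put, so an instance stays alive while any such file is open. A condensed, self-contained sketch of that pairing, with the locking from the earlier helpers omitted and simplified types:

#include <stdio.h>

struct trace_array { int ref; int registered; };

static int trace_array_get(struct trace_array *tr)
{
    if (!tr->registered)             /* array already gone: refuse the open */
        return -1;
    tr->ref++;
    return 0;
}

static void trace_array_put(struct trace_array *tr)
{
    tr->ref--;
}

struct file_ctx { struct trace_array *tr; };

/* ->open: take the reference and stash it with the open file. */
static int generic_open(struct trace_array *tr, struct file_ctx *f)
{
    if (trace_array_get(tr) < 0)
        return -1;
    f->tr = tr;
    return 0;
}

/* ->release: every successful open is paired with exactly one put. */
static int generic_release(struct file_ctx *f)
{
    trace_array_put(f->tr);
    return 0;
}

int main(void)
{
    struct trace_array tr = { .ref = 0, .registered = 1 };
    struct file_ctx f;

    if (generic_open(&tr, &f) == 0) {
        printf("open: ref=%d\n", tr.ref);
        generic_release(&f);
    }
    printf("release: ref=%d\n", tr.ref);
    return 0;
}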
@@ -3293,9 +3446,14 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, | |||
3293 | 3446 | ||
3294 | static int tracing_trace_options_open(struct inode *inode, struct file *file) | 3447 | static int tracing_trace_options_open(struct inode *inode, struct file *file) |
3295 | { | 3448 | { |
3449 | struct trace_array *tr = inode->i_private; | ||
3450 | |||
3296 | if (tracing_disabled) | 3451 | if (tracing_disabled) |
3297 | return -ENODEV; | 3452 | return -ENODEV; |
3298 | 3453 | ||
3454 | if (trace_array_get(tr) < 0) | ||
3455 | return -ENODEV; | ||
3456 | |||
3299 | return single_open(file, tracing_trace_options_show, inode->i_private); | 3457 | return single_open(file, tracing_trace_options_show, inode->i_private); |
3300 | } | 3458 | } |
3301 | 3459 | ||
@@ -3303,7 +3461,7 @@ static const struct file_operations tracing_iter_fops = { | |||
3303 | .open = tracing_trace_options_open, | 3461 | .open = tracing_trace_options_open, |
3304 | .read = seq_read, | 3462 | .read = seq_read, |
3305 | .llseek = seq_lseek, | 3463 | .llseek = seq_lseek, |
3306 | .release = single_release, | 3464 | .release = tracing_single_release_tr, |
3307 | .write = tracing_trace_options_write, | 3465 | .write = tracing_trace_options_write, |
3308 | }; | 3466 | }; |
3309 | 3467 | ||
@@ -3791,6 +3949,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
3791 | if (tracing_disabled) | 3949 | if (tracing_disabled) |
3792 | return -ENODEV; | 3950 | return -ENODEV; |
3793 | 3951 | ||
3952 | if (trace_array_get(tr) < 0) | ||
3953 | return -ENODEV; | ||
3954 | |||
3794 | mutex_lock(&trace_types_lock); | 3955 | mutex_lock(&trace_types_lock); |
3795 | 3956 | ||
3796 | /* create a buffer to store the information to pass to userspace */ | 3957 | /* create a buffer to store the information to pass to userspace */ |
@@ -3843,6 +4004,7 @@ out: | |||
3843 | fail: | 4004 | fail: |
3844 | kfree(iter->trace); | 4005 | kfree(iter->trace); |
3845 | kfree(iter); | 4006 | kfree(iter); |
4007 | __trace_array_put(tr); | ||
3846 | mutex_unlock(&trace_types_lock); | 4008 | mutex_unlock(&trace_types_lock); |
3847 | return ret; | 4009 | return ret; |
3848 | } | 4010 | } |
@@ -3850,6 +4012,8 @@ fail: | |||
3850 | static int tracing_release_pipe(struct inode *inode, struct file *file) | 4012 | static int tracing_release_pipe(struct inode *inode, struct file *file) |
3851 | { | 4013 | { |
3852 | struct trace_iterator *iter = file->private_data; | 4014 | struct trace_iterator *iter = file->private_data; |
4015 | struct trace_cpu *tc = inode->i_private; | ||
4016 | struct trace_array *tr = tc->tr; | ||
3853 | 4017 | ||
3854 | mutex_lock(&trace_types_lock); | 4018 | mutex_lock(&trace_types_lock); |
3855 | 4019 | ||
@@ -3863,6 +4027,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) | |||
3863 | kfree(iter->trace); | 4027 | kfree(iter->trace); |
3864 | kfree(iter); | 4028 | kfree(iter); |
3865 | 4029 | ||
4030 | trace_array_put(tr); | ||
4031 | |||
3866 | return 0; | 4032 | return 0; |
3867 | } | 4033 | } |
3868 | 4034 | ||
@@ -3939,7 +4105,7 @@ static int tracing_wait_pipe(struct file *filp) | |||
3939 | * | 4105 | * |
3940 | * iter->pos will be 0 if we haven't read anything. | 4106 | * iter->pos will be 0 if we haven't read anything. |
3941 | */ | 4107 | */ |
3942 | if (!tracing_is_enabled() && iter->pos) | 4108 | if (!tracing_is_on() && iter->pos) |
3943 | break; | 4109 | break; |
3944 | } | 4110 | } |
3945 | 4111 | ||
@@ -4320,6 +4486,8 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) | |||
4320 | /* resize the ring buffer to 0 */ | 4486 | /* resize the ring buffer to 0 */ |
4321 | tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); | 4487 | tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); |
4322 | 4488 | ||
4489 | trace_array_put(tr); | ||
4490 | |||
4323 | return 0; | 4491 | return 0; |
4324 | } | 4492 | } |
4325 | 4493 | ||
@@ -4328,6 +4496,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
4328 | size_t cnt, loff_t *fpos) | 4496 | size_t cnt, loff_t *fpos) |
4329 | { | 4497 | { |
4330 | unsigned long addr = (unsigned long)ubuf; | 4498 | unsigned long addr = (unsigned long)ubuf; |
4499 | struct trace_array *tr = filp->private_data; | ||
4331 | struct ring_buffer_event *event; | 4500 | struct ring_buffer_event *event; |
4332 | struct ring_buffer *buffer; | 4501 | struct ring_buffer *buffer; |
4333 | struct print_entry *entry; | 4502 | struct print_entry *entry; |
@@ -4387,7 +4556,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
4387 | 4556 | ||
4388 | local_save_flags(irq_flags); | 4557 | local_save_flags(irq_flags); |
4389 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ | 4558 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ |
4390 | buffer = global_trace.trace_buffer.buffer; | 4559 | buffer = tr->trace_buffer.buffer; |
4391 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, | 4560 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, |
4392 | irq_flags, preempt_count()); | 4561 | irq_flags, preempt_count()); |
4393 | if (!event) { | 4562 | if (!event) { |
@@ -4495,10 +4664,20 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, | |||
4495 | 4664 | ||
4496 | static int tracing_clock_open(struct inode *inode, struct file *file) | 4665 | static int tracing_clock_open(struct inode *inode, struct file *file) |
4497 | { | 4666 | { |
4667 | struct trace_array *tr = inode->i_private; | ||
4668 | int ret; | ||
4669 | |||
4498 | if (tracing_disabled) | 4670 | if (tracing_disabled) |
4499 | return -ENODEV; | 4671 | return -ENODEV; |
4500 | 4672 | ||
4501 | return single_open(file, tracing_clock_show, inode->i_private); | 4673 | if (trace_array_get(tr)) |
4674 | return -ENODEV; | ||
4675 | |||
4676 | ret = single_open(file, tracing_clock_show, inode->i_private); | ||
4677 | if (ret < 0) | ||
4678 | trace_array_put(tr); | ||
4679 | |||
4680 | return ret; | ||
4502 | } | 4681 | } |
4503 | 4682 | ||
4504 | struct ftrace_buffer_info { | 4683 | struct ftrace_buffer_info { |
@@ -4511,12 +4690,16 @@ struct ftrace_buffer_info { | |||
4511 | static int tracing_snapshot_open(struct inode *inode, struct file *file) | 4690 | static int tracing_snapshot_open(struct inode *inode, struct file *file) |
4512 | { | 4691 | { |
4513 | struct trace_cpu *tc = inode->i_private; | 4692 | struct trace_cpu *tc = inode->i_private; |
4693 | struct trace_array *tr = tc->tr; | ||
4514 | struct trace_iterator *iter; | 4694 | struct trace_iterator *iter; |
4515 | struct seq_file *m; | 4695 | struct seq_file *m; |
4516 | int ret = 0; | 4696 | int ret = 0; |
4517 | 4697 | ||
4698 | if (trace_array_get(tr) < 0) | ||
4699 | return -ENODEV; | ||
4700 | |||
4518 | if (file->f_mode & FMODE_READ) { | 4701 | if (file->f_mode & FMODE_READ) { |
4519 | iter = __tracing_open(inode, file, true); | 4702 | iter = __tracing_open(tr, tc, inode, file, true); |
4520 | if (IS_ERR(iter)) | 4703 | if (IS_ERR(iter)) |
4521 | ret = PTR_ERR(iter); | 4704 | ret = PTR_ERR(iter); |
4522 | } else { | 4705 | } else { |
@@ -4529,13 +4712,16 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) | |||
4529 | kfree(m); | 4712 | kfree(m); |
4530 | return -ENOMEM; | 4713 | return -ENOMEM; |
4531 | } | 4714 | } |
4532 | iter->tr = tc->tr; | 4715 | iter->tr = tr; |
4533 | iter->trace_buffer = &tc->tr->max_buffer; | 4716 | iter->trace_buffer = &tc->tr->max_buffer; |
4534 | iter->cpu_file = tc->cpu; | 4717 | iter->cpu_file = tc->cpu; |
4535 | m->private = iter; | 4718 | m->private = iter; |
4536 | file->private_data = m; | 4719 | file->private_data = m; |
4537 | } | 4720 | } |
4538 | 4721 | ||
4722 | if (ret < 0) | ||
4723 | trace_array_put(tr); | ||
4724 | |||
4539 | return ret; | 4725 | return ret; |
4540 | } | 4726 | } |
4541 | 4727 | ||
@@ -4616,9 +4802,12 @@ out: | |||
4616 | static int tracing_snapshot_release(struct inode *inode, struct file *file) | 4802 | static int tracing_snapshot_release(struct inode *inode, struct file *file) |
4617 | { | 4803 | { |
4618 | struct seq_file *m = file->private_data; | 4804 | struct seq_file *m = file->private_data; |
4805 | int ret; | ||
4806 | |||
4807 | ret = tracing_release(inode, file); | ||
4619 | 4808 | ||
4620 | if (file->f_mode & FMODE_READ) | 4809 | if (file->f_mode & FMODE_READ) |
4621 | return tracing_release(inode, file); | 4810 | return ret; |
4622 | 4811 | ||
4623 | /* If write only, the seq_file is just a stub */ | 4812 | /* If write only, the seq_file is just a stub */ |
4624 | if (m) | 4813 | if (m) |
@@ -4684,34 +4873,38 @@ static const struct file_operations tracing_pipe_fops = { | |||
4684 | }; | 4873 | }; |
4685 | 4874 | ||
4686 | static const struct file_operations tracing_entries_fops = { | 4875 | static const struct file_operations tracing_entries_fops = { |
4687 | .open = tracing_open_generic, | 4876 | .open = tracing_open_generic_tc, |
4688 | .read = tracing_entries_read, | 4877 | .read = tracing_entries_read, |
4689 | .write = tracing_entries_write, | 4878 | .write = tracing_entries_write, |
4690 | .llseek = generic_file_llseek, | 4879 | .llseek = generic_file_llseek, |
4880 | .release = tracing_release_generic_tc, | ||
4691 | }; | 4881 | }; |
4692 | 4882 | ||
4693 | static const struct file_operations tracing_total_entries_fops = { | 4883 | static const struct file_operations tracing_total_entries_fops = { |
4694 | .open = tracing_open_generic, | 4884 | .open = tracing_open_generic_tr, |
4695 | .read = tracing_total_entries_read, | 4885 | .read = tracing_total_entries_read, |
4696 | .llseek = generic_file_llseek, | 4886 | .llseek = generic_file_llseek, |
4887 | .release = tracing_release_generic_tr, | ||
4697 | }; | 4888 | }; |
4698 | 4889 | ||
4699 | static const struct file_operations tracing_free_buffer_fops = { | 4890 | static const struct file_operations tracing_free_buffer_fops = { |
4891 | .open = tracing_open_generic_tr, | ||
4700 | .write = tracing_free_buffer_write, | 4892 | .write = tracing_free_buffer_write, |
4701 | .release = tracing_free_buffer_release, | 4893 | .release = tracing_free_buffer_release, |
4702 | }; | 4894 | }; |
4703 | 4895 | ||
4704 | static const struct file_operations tracing_mark_fops = { | 4896 | static const struct file_operations tracing_mark_fops = { |
4705 | .open = tracing_open_generic, | 4897 | .open = tracing_open_generic_tr, |
4706 | .write = tracing_mark_write, | 4898 | .write = tracing_mark_write, |
4707 | .llseek = generic_file_llseek, | 4899 | .llseek = generic_file_llseek, |
4900 | .release = tracing_release_generic_tr, | ||
4708 | }; | 4901 | }; |
4709 | 4902 | ||
4710 | static const struct file_operations trace_clock_fops = { | 4903 | static const struct file_operations trace_clock_fops = { |
4711 | .open = tracing_clock_open, | 4904 | .open = tracing_clock_open, |
4712 | .read = seq_read, | 4905 | .read = seq_read, |
4713 | .llseek = seq_lseek, | 4906 | .llseek = seq_lseek, |
4714 | .release = single_release, | 4907 | .release = tracing_single_release_tr, |
4715 | .write = tracing_clock_write, | 4908 | .write = tracing_clock_write, |
4716 | }; | 4909 | }; |
4717 | 4910 | ||
@@ -4739,13 +4932,19 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) | |||
4739 | struct trace_cpu *tc = inode->i_private; | 4932 | struct trace_cpu *tc = inode->i_private; |
4740 | struct trace_array *tr = tc->tr; | 4933 | struct trace_array *tr = tc->tr; |
4741 | struct ftrace_buffer_info *info; | 4934 | struct ftrace_buffer_info *info; |
4935 | int ret; | ||
4742 | 4936 | ||
4743 | if (tracing_disabled) | 4937 | if (tracing_disabled) |
4744 | return -ENODEV; | 4938 | return -ENODEV; |
4745 | 4939 | ||
4940 | if (trace_array_get(tr) < 0) | ||
4941 | return -ENODEV; | ||
4942 | |||
4746 | info = kzalloc(sizeof(*info), GFP_KERNEL); | 4943 | info = kzalloc(sizeof(*info), GFP_KERNEL); |
4747 | if (!info) | 4944 | if (!info) { |
4945 | trace_array_put(tr); | ||
4748 | return -ENOMEM; | 4946 | return -ENOMEM; |
4947 | } | ||
4749 | 4948 | ||
4750 | mutex_lock(&trace_types_lock); | 4949 | mutex_lock(&trace_types_lock); |
4751 | 4950 | ||
@@ -4763,7 +4962,11 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) | |||
4763 | 4962 | ||
4764 | mutex_unlock(&trace_types_lock); | 4963 | mutex_unlock(&trace_types_lock); |
4765 | 4964 | ||
4766 | return nonseekable_open(inode, filp); | 4965 | ret = nonseekable_open(inode, filp); |
4966 | if (ret < 0) | ||
4967 | trace_array_put(tr); | ||
4968 | |||
4969 | return ret; | ||
4767 | } | 4970 | } |
4768 | 4971 | ||
4769 | static unsigned int | 4972 | static unsigned int |
@@ -4863,8 +5066,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) | |||
4863 | 5066 | ||
4864 | mutex_lock(&trace_types_lock); | 5067 | mutex_lock(&trace_types_lock); |
4865 | 5068 | ||
4866 | WARN_ON(!iter->tr->ref); | 5069 | __trace_array_put(iter->tr); |
4867 | iter->tr->ref--; | ||
4868 | 5070 | ||
4869 | if (info->spare) | 5071 | if (info->spare) |
4870 | ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); | 5072 | ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); |
@@ -5612,15 +5814,10 @@ rb_simple_read(struct file *filp, char __user *ubuf, | |||
5612 | size_t cnt, loff_t *ppos) | 5814 | size_t cnt, loff_t *ppos) |
5613 | { | 5815 | { |
5614 | struct trace_array *tr = filp->private_data; | 5816 | struct trace_array *tr = filp->private_data; |
5615 | struct ring_buffer *buffer = tr->trace_buffer.buffer; | ||
5616 | char buf[64]; | 5817 | char buf[64]; |
5617 | int r; | 5818 | int r; |
5618 | 5819 | ||
5619 | if (buffer) | 5820 | r = tracer_tracing_is_on(tr); |
5620 | r = ring_buffer_record_is_on(buffer); | ||
5621 | else | ||
5622 | r = 0; | ||
5623 | |||
5624 | r = sprintf(buf, "%d\n", r); | 5821 | r = sprintf(buf, "%d\n", r); |
5625 | 5822 | ||
5626 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | 5823 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); |
@@ -5642,11 +5839,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf, | |||
5642 | if (buffer) { | 5839 | if (buffer) { |
5643 | mutex_lock(&trace_types_lock); | 5840 | mutex_lock(&trace_types_lock); |
5644 | if (val) { | 5841 | if (val) { |
5645 | ring_buffer_record_on(buffer); | 5842 | tracer_tracing_on(tr); |
5646 | if (tr->current_trace->start) | 5843 | if (tr->current_trace->start) |
5647 | tr->current_trace->start(tr); | 5844 | tr->current_trace->start(tr); |
5648 | } else { | 5845 | } else { |
5649 | ring_buffer_record_off(buffer); | 5846 | tracer_tracing_off(tr); |
5650 | if (tr->current_trace->stop) | 5847 | if (tr->current_trace->stop) |
5651 | tr->current_trace->stop(tr); | 5848 | tr->current_trace->stop(tr); |
5652 | } | 5849 | } |
@@ -5659,9 +5856,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf, | |||
5659 | } | 5856 | } |
5660 | 5857 | ||
5661 | static const struct file_operations rb_simple_fops = { | 5858 | static const struct file_operations rb_simple_fops = { |
5662 | .open = tracing_open_generic, | 5859 | .open = tracing_open_generic_tr, |
5663 | .read = rb_simple_read, | 5860 | .read = rb_simple_read, |
5664 | .write = rb_simple_write, | 5861 | .write = rb_simple_write, |
5862 | .release = tracing_release_generic_tr, | ||
5665 | .llseek = default_llseek, | 5863 | .llseek = default_llseek, |
5666 | }; | 5864 | }; |
5667 | 5865 | ||
@@ -5933,7 +6131,7 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | |||
5933 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, | 6131 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, |
5934 | tr, &tracing_total_entries_fops); | 6132 | tr, &tracing_total_entries_fops); |
5935 | 6133 | ||
5936 | trace_create_file("free_buffer", 0644, d_tracer, | 6134 | trace_create_file("free_buffer", 0200, d_tracer, |
5937 | tr, &tracing_free_buffer_fops); | 6135 | tr, &tracing_free_buffer_fops); |
5938 | 6136 | ||
5939 | trace_create_file("trace_marker", 0220, d_tracer, | 6137 | trace_create_file("trace_marker", 0220, d_tracer, |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 20572ed88c5c..4a4f6e1828b6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -224,6 +224,11 @@ enum { | |||
224 | 224 | ||
225 | extern struct list_head ftrace_trace_arrays; | 225 | extern struct list_head ftrace_trace_arrays; |
226 | 226 | ||
227 | extern struct mutex trace_types_lock; | ||
228 | |||
229 | extern int trace_array_get(struct trace_array *tr); | ||
230 | extern void trace_array_put(struct trace_array *tr); | ||
231 | |||
227 | /* | 232 | /* |
228 | * The global tracer (top) should be the first trace array added, | 233 | * The global tracer (top) should be the first trace array added, |
229 | * but we check the flag anyway. | 234 | * but we check the flag anyway. |
@@ -554,11 +559,6 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu); | |||
554 | 559 | ||
555 | void poll_wait_pipe(struct trace_iterator *iter); | 560 | void poll_wait_pipe(struct trace_iterator *iter); |
556 | 561 | ||
557 | void ftrace(struct trace_array *tr, | ||
558 | struct trace_array_cpu *data, | ||
559 | unsigned long ip, | ||
560 | unsigned long parent_ip, | ||
561 | unsigned long flags, int pc); | ||
562 | void tracing_sched_switch_trace(struct trace_array *tr, | 562 | void tracing_sched_switch_trace(struct trace_array *tr, |
563 | struct task_struct *prev, | 563 | struct task_struct *prev, |
564 | struct task_struct *next, | 564 | struct task_struct *next, |
@@ -774,6 +774,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) | |||
774 | extern struct list_head ftrace_pids; | 774 | extern struct list_head ftrace_pids; |
775 | 775 | ||
776 | #ifdef CONFIG_FUNCTION_TRACER | 776 | #ifdef CONFIG_FUNCTION_TRACER |
777 | extern bool ftrace_filter_param __initdata; | ||
777 | static inline int ftrace_trace_task(struct task_struct *task) | 778 | static inline int ftrace_trace_task(struct task_struct *task) |
778 | { | 779 | { |
779 | if (list_empty(&ftrace_pids)) | 780 | if (list_empty(&ftrace_pids)) |
@@ -899,12 +900,6 @@ static inline void trace_branch_disable(void) | |||
899 | /* set ring buffers to default size if not already done so */ | 900 | /* set ring buffers to default size if not already done so */ |
900 | int tracing_update_buffers(void); | 901 | int tracing_update_buffers(void); |
901 | 902 | ||
902 | /* trace event type bit fields, not numeric */ | ||
903 | enum { | ||
904 | TRACE_EVENT_TYPE_PRINTF = 1, | ||
905 | TRACE_EVENT_TYPE_RAW = 2, | ||
906 | }; | ||
907 | |||
908 | struct ftrace_event_field { | 903 | struct ftrace_event_field { |
909 | struct list_head link; | 904 | struct list_head link; |
910 | const char *name; | 905 | const char *name; |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 27963e2bf4bf..7d854290bf81 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -41,6 +41,23 @@ static LIST_HEAD(ftrace_common_fields); | |||
41 | static struct kmem_cache *field_cachep; | 41 | static struct kmem_cache *field_cachep; |
42 | static struct kmem_cache *file_cachep; | 42 | static struct kmem_cache *file_cachep; |
43 | 43 | ||
44 | #define SYSTEM_FL_FREE_NAME (1 << 31) | ||
45 | |||
46 | static inline int system_refcount(struct event_subsystem *system) | ||
47 | { | ||
48 | return system->ref_count & ~SYSTEM_FL_FREE_NAME; | ||
49 | } | ||
50 | |||
51 | static int system_refcount_inc(struct event_subsystem *system) | ||
52 | { | ||
53 | return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME; | ||
54 | } | ||
55 | |||
56 | static int system_refcount_dec(struct event_subsystem *system) | ||
57 | { | ||
58 | return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME; | ||
59 | } | ||
60 | |||
44 | /* Double loops, do not use break, only goto's work */ | 61 | /* Double loops, do not use break, only goto's work */ |
45 | #define do_for_each_event_file(tr, file) \ | 62 | #define do_for_each_event_file(tr, file) \ |
46 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ | 63 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ |
@@ -97,7 +114,7 @@ static int __trace_define_field(struct list_head *head, const char *type, | |||
97 | 114 | ||
98 | field = kmem_cache_alloc(field_cachep, GFP_TRACE); | 115 | field = kmem_cache_alloc(field_cachep, GFP_TRACE); |
99 | if (!field) | 116 | if (!field) |
100 | goto err; | 117 | return -ENOMEM; |
101 | 118 | ||
102 | field->name = name; | 119 | field->name = name; |
103 | field->type = type; | 120 | field->type = type; |
@@ -114,11 +131,6 @@ static int __trace_define_field(struct list_head *head, const char *type, | |||
114 | list_add(&field->link, head); | 131 | list_add(&field->link, head); |
115 | 132 | ||
116 | return 0; | 133 | return 0; |
117 | |||
118 | err: | ||
119 | kmem_cache_free(field_cachep, field); | ||
120 | |||
121 | return -ENOMEM; | ||
122 | } | 134 | } |
123 | 135 | ||
124 | int trace_define_field(struct ftrace_event_call *call, const char *type, | 136 | int trace_define_field(struct ftrace_event_call *call, const char *type, |
@@ -279,9 +291,11 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, | |||
279 | } | 291 | } |
280 | call->class->reg(call, TRACE_REG_UNREGISTER, file); | 292 | call->class->reg(call, TRACE_REG_UNREGISTER, file); |
281 | } | 293 | } |
282 | /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ | 294 | /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ |
283 | if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) | 295 | if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) |
284 | set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); | 296 | set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); |
297 | else | ||
298 | clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); | ||
285 | break; | 299 | break; |
286 | case 1: | 300 | case 1: |
287 | /* | 301 | /* |
@@ -349,8 +363,8 @@ static void __put_system(struct event_subsystem *system) | |||
349 | { | 363 | { |
350 | struct event_filter *filter = system->filter; | 364 | struct event_filter *filter = system->filter; |
351 | 365 | ||
352 | WARN_ON_ONCE(system->ref_count == 0); | 366 | WARN_ON_ONCE(system_refcount(system) == 0); |
353 | if (--system->ref_count) | 367 | if (system_refcount_dec(system)) |
354 | return; | 368 | return; |
355 | 369 | ||
356 | list_del(&system->list); | 370 | list_del(&system->list); |
@@ -359,13 +373,15 @@ static void __put_system(struct event_subsystem *system) | |||
359 | kfree(filter->filter_string); | 373 | kfree(filter->filter_string); |
360 | kfree(filter); | 374 | kfree(filter); |
361 | } | 375 | } |
376 | if (system->ref_count & SYSTEM_FL_FREE_NAME) | ||
377 | kfree(system->name); | ||
362 | kfree(system); | 378 | kfree(system); |
363 | } | 379 | } |
364 | 380 | ||
365 | static void __get_system(struct event_subsystem *system) | 381 | static void __get_system(struct event_subsystem *system) |
366 | { | 382 | { |
367 | WARN_ON_ONCE(system->ref_count == 0); | 383 | WARN_ON_ONCE(system_refcount(system) == 0); |
368 | system->ref_count++; | 384 | system_refcount_inc(system); |
369 | } | 385 | } |
370 | 386 | ||
371 | static void __get_system_dir(struct ftrace_subsystem_dir *dir) | 387 | static void __get_system_dir(struct ftrace_subsystem_dir *dir) |
@@ -379,7 +395,7 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir) | |||
379 | { | 395 | { |
380 | WARN_ON_ONCE(dir->ref_count == 0); | 396 | WARN_ON_ONCE(dir->ref_count == 0); |
381 | /* If the subsystem is about to be freed, the dir must be too */ | 397 | /* If the subsystem is about to be freed, the dir must be too */ |
382 | WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); | 398 | WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1); |
383 | 399 | ||
384 | __put_system(dir->subsystem); | 400 | __put_system(dir->subsystem); |
385 | if (!--dir->ref_count) | 401 | if (!--dir->ref_count) |
@@ -394,16 +410,45 @@ static void put_system(struct ftrace_subsystem_dir *dir) | |||
394 | } | 410 | } |
395 | 411 | ||
396 | /* | 412 | /* |
413 | * Open and update trace_array ref count. | ||
414 | * Must have the current trace_array passed to it. | ||
415 | */ | ||
416 | static int tracing_open_generic_file(struct inode *inode, struct file *filp) | ||
417 | { | ||
418 | struct ftrace_event_file *file = inode->i_private; | ||
419 | struct trace_array *tr = file->tr; | ||
420 | int ret; | ||
421 | |||
422 | if (trace_array_get(tr) < 0) | ||
423 | return -ENODEV; | ||
424 | |||
425 | ret = tracing_open_generic(inode, filp); | ||
426 | if (ret < 0) | ||
427 | trace_array_put(tr); | ||
428 | return ret; | ||
429 | } | ||
430 | |||
431 | static int tracing_release_generic_file(struct inode *inode, struct file *filp) | ||
432 | { | ||
433 | struct ftrace_event_file *file = inode->i_private; | ||
434 | struct trace_array *tr = file->tr; | ||
435 | |||
436 | trace_array_put(tr); | ||
437 | |||
438 | return 0; | ||
439 | } | ||
440 | |||
441 | /* | ||
397 | * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. | 442 | * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. |
398 | */ | 443 | */ |
399 | static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, | 444 | static int |
400 | const char *sub, const char *event, int set) | 445 | __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, |
446 | const char *sub, const char *event, int set) | ||
401 | { | 447 | { |
402 | struct ftrace_event_file *file; | 448 | struct ftrace_event_file *file; |
403 | struct ftrace_event_call *call; | 449 | struct ftrace_event_call *call; |
404 | int ret = -EINVAL; | 450 | int ret = -EINVAL; |
405 | 451 | ||
406 | mutex_lock(&event_mutex); | ||
407 | list_for_each_entry(file, &tr->events, list) { | 452 | list_for_each_entry(file, &tr->events, list) { |
408 | 453 | ||
409 | call = file->event_call; | 454 | call = file->event_call; |
@@ -429,6 +474,17 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, | |||
429 | 474 | ||
430 | ret = 0; | 475 | ret = 0; |
431 | } | 476 | } |
477 | |||
478 | return ret; | ||
479 | } | ||
480 | |||
481 | static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, | ||
482 | const char *sub, const char *event, int set) | ||
483 | { | ||
484 | int ret; | ||
485 | |||
486 | mutex_lock(&event_mutex); | ||
487 | ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set); | ||
432 | mutex_unlock(&event_mutex); | 488 | mutex_unlock(&event_mutex); |
433 | 489 | ||
434 | return ret; | 490 | return ret; |
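__ftrace_set_clr_event() is split into a __ftrace_set_clr_event_nolock() worker plus a thin wrapper that takes event_mutex, the usual convention for letting a caller that already holds the lock reuse the body without self-deadlocking. A generic sketch of that lock/_nolock split with a pthread mutex; the names and the counter are illustrative only:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;
static int enabled_events;

/* Worker: caller must already hold event_mutex. */
static int set_clr_event_nolock(int set)
{
    enabled_events += set ? 1 : -1;
    return 0;
}

/* Public wrapper: takes the lock, then defers to the worker. */
static int set_clr_event(int set)
{
    int ret;

    pthread_mutex_lock(&event_mutex);
    ret = set_clr_event_nolock(set);
    pthread_mutex_unlock(&event_mutex);
    return ret;
}

/* A caller that already holds the lock for a larger operation. */
static int enable_many(int n)
{
    int i, ret = 0;

    pthread_mutex_lock(&event_mutex);
    for (i = 0; i < n && !ret; i++)
        ret = set_clr_event_nolock(1);   /* the wrapper here would self-deadlock */
    pthread_mutex_unlock(&event_mutex);
    return ret;
}

int main(void)
{
    set_clr_event(1);
    enable_many(3);
    printf("enabled: %d\n", enabled_events);
    return 0;
}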
@@ -624,17 +680,17 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
624 | loff_t *ppos) | 680 | loff_t *ppos) |
625 | { | 681 | { |
626 | struct ftrace_event_file *file = filp->private_data; | 682 | struct ftrace_event_file *file = filp->private_data; |
627 | char *buf; | 683 | char buf[4] = "0"; |
628 | 684 | ||
629 | if (file->flags & FTRACE_EVENT_FL_ENABLED) { | 685 | if (file->flags & FTRACE_EVENT_FL_ENABLED && |
630 | if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) | 686 | !(file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) |
631 | buf = "0*\n"; | 687 | strcpy(buf, "1"); |
632 | else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) | 688 | |
633 | buf = "1*\n"; | 689 | if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED || |
634 | else | 690 | file->flags & FTRACE_EVENT_FL_SOFT_MODE) |
635 | buf = "1\n"; | 691 | strcat(buf, "*"); |
636 | } else | 692 | |
637 | buf = "0\n"; | 693 | strcat(buf, "\n"); |
638 | 694 | ||
639 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); | 695 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); |
640 | } | 696 | } |
@@ -992,6 +1048,7 @@ static int subsystem_open(struct inode *inode, struct file *filp) | |||
992 | int ret; | 1048 | int ret; |
993 | 1049 | ||
994 | /* Make sure the system still exists */ | 1050 | /* Make sure the system still exists */ |
1051 | mutex_lock(&trace_types_lock); | ||
995 | mutex_lock(&event_mutex); | 1052 | mutex_lock(&event_mutex); |
996 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { | 1053 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { |
997 | list_for_each_entry(dir, &tr->systems, list) { | 1054 | list_for_each_entry(dir, &tr->systems, list) { |
@@ -1007,6 +1064,7 @@ static int subsystem_open(struct inode *inode, struct file *filp) | |||
1007 | } | 1064 | } |
1008 | exit_loop: | 1065 | exit_loop: |
1009 | mutex_unlock(&event_mutex); | 1066 | mutex_unlock(&event_mutex); |
1067 | mutex_unlock(&trace_types_lock); | ||
1010 | 1068 | ||
1011 | if (!system) | 1069 | if (!system) |
1012 | return -ENODEV; | 1070 | return -ENODEV; |
@@ -1014,9 +1072,17 @@ static int subsystem_open(struct inode *inode, struct file *filp) | |||
1014 | /* Some versions of gcc think dir can be uninitialized here */ | 1072 | /* Some versions of gcc think dir can be uninitialized here */ |
1015 | WARN_ON(!dir); | 1073 | WARN_ON(!dir); |
1016 | 1074 | ||
1075 | /* Still need to increment the ref count of the system */ | ||
1076 | if (trace_array_get(tr) < 0) { | ||
1077 | put_system(dir); | ||
1078 | return -ENODEV; | ||
1079 | } | ||
1080 | |||
1017 | ret = tracing_open_generic(inode, filp); | 1081 | ret = tracing_open_generic(inode, filp); |
1018 | if (ret < 0) | 1082 | if (ret < 0) { |
1083 | trace_array_put(tr); | ||
1019 | put_system(dir); | 1084 | put_system(dir); |
1085 | } | ||
1020 | 1086 | ||
1021 | return ret; | 1087 | return ret; |
1022 | } | 1088 | } |
@@ -1027,16 +1093,23 @@ static int system_tr_open(struct inode *inode, struct file *filp) | |||
1027 | struct trace_array *tr = inode->i_private; | 1093 | struct trace_array *tr = inode->i_private; |
1028 | int ret; | 1094 | int ret; |
1029 | 1095 | ||
1096 | if (trace_array_get(tr) < 0) | ||
1097 | return -ENODEV; | ||
1098 | |||
1030 | /* Make a temporary dir that has no system but points to tr */ | 1099 | /* Make a temporary dir that has no system but points to tr */ |
1031 | dir = kzalloc(sizeof(*dir), GFP_KERNEL); | 1100 | dir = kzalloc(sizeof(*dir), GFP_KERNEL); |
1032 | if (!dir) | 1101 | if (!dir) { |
1102 | trace_array_put(tr); | ||
1033 | return -ENOMEM; | 1103 | return -ENOMEM; |
1104 | } | ||
1034 | 1105 | ||
1035 | dir->tr = tr; | 1106 | dir->tr = tr; |
1036 | 1107 | ||
1037 | ret = tracing_open_generic(inode, filp); | 1108 | ret = tracing_open_generic(inode, filp); |
1038 | if (ret < 0) | 1109 | if (ret < 0) { |
1110 | trace_array_put(tr); | ||
1039 | kfree(dir); | 1111 | kfree(dir); |
1112 | } | ||
1040 | 1113 | ||
1041 | filp->private_data = dir; | 1114 | filp->private_data = dir; |
1042 | 1115 | ||
@@ -1047,6 +1120,8 @@ static int subsystem_release(struct inode *inode, struct file *file) | |||
1047 | { | 1120 | { |
1048 | struct ftrace_subsystem_dir *dir = file->private_data; | 1121 | struct ftrace_subsystem_dir *dir = file->private_data; |
1049 | 1122 | ||
1123 | trace_array_put(dir->tr); | ||
1124 | |||
1050 | /* | 1125 | /* |
1051 | * If dir->subsystem is NULL, then this is a temporary | 1126 | * If dir->subsystem is NULL, then this is a temporary |
1052 | * descriptor that was made for a trace_array to enable | 1127 | * descriptor that was made for a trace_array to enable |
@@ -1174,9 +1249,10 @@ static const struct file_operations ftrace_set_event_fops = { | |||
1174 | }; | 1249 | }; |
1175 | 1250 | ||
1176 | static const struct file_operations ftrace_enable_fops = { | 1251 | static const struct file_operations ftrace_enable_fops = { |
1177 | .open = tracing_open_generic, | 1252 | .open = tracing_open_generic_file, |
1178 | .read = event_enable_read, | 1253 | .read = event_enable_read, |
1179 | .write = event_enable_write, | 1254 | .write = event_enable_write, |
1255 | .release = tracing_release_generic_file, | ||
1180 | .llseek = default_llseek, | 1256 | .llseek = default_llseek, |
1181 | }; | 1257 | }; |
1182 | 1258 | ||
@@ -1279,7 +1355,15 @@ create_new_subsystem(const char *name) | |||
1279 | return NULL; | 1355 | return NULL; |
1280 | 1356 | ||
1281 | system->ref_count = 1; | 1357 | system->ref_count = 1; |
1282 | system->name = name; | 1358 | |
1359 | /* Only allocate if dynamic (kprobes and modules) */ | ||
1360 | if (!core_kernel_data((unsigned long)name)) { | ||
1361 | system->ref_count |= SYSTEM_FL_FREE_NAME; | ||
1362 | system->name = kstrdup(name, GFP_KERNEL); | ||
1363 | if (!system->name) | ||
1364 | goto out_free; | ||
1365 | } else | ||
1366 | system->name = name; | ||
1283 | 1367 | ||
1284 | system->filter = NULL; | 1368 | system->filter = NULL; |
1285 | 1369 | ||
@@ -1292,6 +1376,8 @@ create_new_subsystem(const char *name) | |||
1292 | return system; | 1376 | return system; |
1293 | 1377 | ||
1294 | out_free: | 1378 | out_free: |
1379 | if (system->ref_count & SYSTEM_FL_FREE_NAME) | ||
1380 | kfree(system->name); | ||
1295 | kfree(system); | 1381 | kfree(system); |
1296 | return NULL; | 1382 | return NULL; |
1297 | } | 1383 | } |
@@ -1591,6 +1677,7 @@ static void __add_event_to_tracers(struct ftrace_event_call *call, | |||
1591 | int trace_add_event_call(struct ftrace_event_call *call) | 1677 | int trace_add_event_call(struct ftrace_event_call *call) |
1592 | { | 1678 | { |
1593 | int ret; | 1679 | int ret; |
1680 | mutex_lock(&trace_types_lock); | ||
1594 | mutex_lock(&event_mutex); | 1681 | mutex_lock(&event_mutex); |
1595 | 1682 | ||
1596 | ret = __register_event(call, NULL); | 1683 | ret = __register_event(call, NULL); |
@@ -1598,11 +1685,13 @@ int trace_add_event_call(struct ftrace_event_call *call) | |||
1598 | __add_event_to_tracers(call, NULL); | 1685 | __add_event_to_tracers(call, NULL); |
1599 | 1686 | ||
1600 | mutex_unlock(&event_mutex); | 1687 | mutex_unlock(&event_mutex); |
1688 | mutex_unlock(&trace_types_lock); | ||
1601 | return ret; | 1689 | return ret; |
1602 | } | 1690 | } |
1603 | 1691 | ||
1604 | /* | 1692 | /* |
1605 | * Must be called under locking both of event_mutex and trace_event_sem. | 1693 | * Must be called under locking of trace_types_lock, event_mutex and |
1694 | * trace_event_sem. | ||
1606 | */ | 1695 | */ |
1607 | static void __trace_remove_event_call(struct ftrace_event_call *call) | 1696 | static void __trace_remove_event_call(struct ftrace_event_call *call) |
1608 | { | 1697 | { |
@@ -1614,11 +1703,13 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) | |||
1614 | /* Remove an event_call */ | 1703 | /* Remove an event_call */ |
1615 | void trace_remove_event_call(struct ftrace_event_call *call) | 1704 | void trace_remove_event_call(struct ftrace_event_call *call) |
1616 | { | 1705 | { |
1706 | mutex_lock(&trace_types_lock); | ||
1617 | mutex_lock(&event_mutex); | 1707 | mutex_lock(&event_mutex); |
1618 | down_write(&trace_event_sem); | 1708 | down_write(&trace_event_sem); |
1619 | __trace_remove_event_call(call); | 1709 | __trace_remove_event_call(call); |
1620 | up_write(&trace_event_sem); | 1710 | up_write(&trace_event_sem); |
1621 | mutex_unlock(&event_mutex); | 1711 | mutex_unlock(&event_mutex); |
1712 | mutex_unlock(&trace_types_lock); | ||
1622 | } | 1713 | } |
1623 | 1714 | ||
1624 | #define for_each_event(event, start, end) \ | 1715 | #define for_each_event(event, start, end) \ |
@@ -1762,6 +1853,7 @@ static int trace_module_notify(struct notifier_block *self, | |||
1762 | { | 1853 | { |
1763 | struct module *mod = data; | 1854 | struct module *mod = data; |
1764 | 1855 | ||
1856 | mutex_lock(&trace_types_lock); | ||
1765 | mutex_lock(&event_mutex); | 1857 | mutex_lock(&event_mutex); |
1766 | switch (val) { | 1858 | switch (val) { |
1767 | case MODULE_STATE_COMING: | 1859 | case MODULE_STATE_COMING: |
@@ -1772,6 +1864,7 @@ static int trace_module_notify(struct notifier_block *self, | |||
1772 | break; | 1864 | break; |
1773 | } | 1865 | } |
1774 | mutex_unlock(&event_mutex); | 1866 | mutex_unlock(&event_mutex); |
1867 | mutex_unlock(&trace_types_lock); | ||
1775 | 1868 | ||
1776 | return 0; | 1869 | return 0; |
1777 | } | 1870 | } |
@@ -2011,10 +2104,7 @@ event_enable_func(struct ftrace_hash *hash, | |||
2011 | int ret; | 2104 | int ret; |
2012 | 2105 | ||
2013 | /* hash funcs only work with set_ftrace_filter */ | 2106 | /* hash funcs only work with set_ftrace_filter */ |
2014 | if (!enabled) | 2107 | if (!enabled || !param) |
2015 | return -EINVAL; | ||
2016 | |||
2017 | if (!param) | ||
2018 | return -EINVAL; | 2108 | return -EINVAL; |
2019 | 2109 | ||
2020 | system = strsep(&param, ":"); | 2110 | system = strsep(&param, ":");
@@ -2329,11 +2419,11 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr) | |||
2329 | 2419 | ||
2330 | int event_trace_del_tracer(struct trace_array *tr) | 2420 | int event_trace_del_tracer(struct trace_array *tr) |
2331 | { | 2421 | { |
2332 | /* Disable any running events */ | ||
2333 | __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); | ||
2334 | |||
2335 | mutex_lock(&event_mutex); | 2422 | mutex_lock(&event_mutex); |
2336 | 2423 | ||
2424 | /* Disable any running events */ | ||
2425 | __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); | ||
2426 | |||
2337 | down_write(&trace_event_sem); | 2427 | down_write(&trace_event_sem); |
2338 | __trace_remove_event_dirs(tr); | 2428 | __trace_remove_event_dirs(tr); |
2339 | debugfs_remove_recursive(tr->event_dir); | 2429 | debugfs_remove_recursive(tr->event_dir); |
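
The subsystem_open(), system_tr_open() and subsystem_release() hunks above all follow one rule: pin the trace_array with trace_array_get() before the open can succeed, drop the reference on every failure path, and drop it again on release. A minimal sketch of that get-on-open / put-on-release pairing, with hypothetical my_* helpers standing in for trace_array_get/put, could look like:

#include <linux/fs.h>

/* Hypothetical sketch, not code from the tree: my_dev, my_resource_get()
 * and my_resource_put() stand in for trace_array and trace_array_get/put. */
struct my_dev;
int my_resource_get(struct my_dev *dev);
void my_resource_put(struct my_dev *dev);

static int my_open(struct inode *inode, struct file *filp)
{
	struct my_dev *dev = inode->i_private;
	int ret;

	if (my_resource_get(dev) < 0)		/* pin the backing object first */
		return -ENODEV;

	ret = simple_open(inode, filp);
	if (ret < 0)
		my_resource_put(dev);		/* every error path drops the reference */

	return ret;
}

static int my_release(struct inode *inode, struct file *filp)
{
	my_resource_put(inode->i_private);	/* undo the get taken at open */
	return 0;
}

The same pairing is what lets subsystem_release() above call trace_array_put() unconditionally.
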
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e1b653f7e1ca..0d883dc057d6 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -44,6 +44,7 @@ enum filter_op_ids | |||
44 | OP_LE, | 44 | OP_LE, |
45 | OP_GT, | 45 | OP_GT, |
46 | OP_GE, | 46 | OP_GE, |
47 | OP_BAND, | ||
47 | OP_NONE, | 48 | OP_NONE, |
48 | OP_OPEN_PAREN, | 49 | OP_OPEN_PAREN, |
49 | }; | 50 | }; |
@@ -54,6 +55,7 @@ struct filter_op { | |||
54 | int precedence; | 55 | int precedence; |
55 | }; | 56 | }; |
56 | 57 | ||
58 | /* Order must be the same as enum filter_op_ids above */ | ||
57 | static struct filter_op filter_ops[] = { | 59 | static struct filter_op filter_ops[] = { |
58 | { OP_OR, "||", 1 }, | 60 | { OP_OR, "||", 1 }, |
59 | { OP_AND, "&&", 2 }, | 61 | { OP_AND, "&&", 2 }, |
@@ -64,6 +66,7 @@ static struct filter_op filter_ops[] = { | |||
64 | { OP_LE, "<=", 5 }, | 66 | { OP_LE, "<=", 5 }, |
65 | { OP_GT, ">", 5 }, | 67 | { OP_GT, ">", 5 }, |
66 | { OP_GE, ">=", 5 }, | 68 | { OP_GE, ">=", 5 }, |
69 | { OP_BAND, "&", 6 }, | ||
67 | { OP_NONE, "OP_NONE", 0 }, | 70 | { OP_NONE, "OP_NONE", 0 }, |
68 | { OP_OPEN_PAREN, "(", 0 }, | 71 | { OP_OPEN_PAREN, "(", 0 }, |
69 | }; | 72 | }; |
@@ -156,6 +159,9 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \ | |||
156 | case OP_GE: \ | 159 | case OP_GE: \ |
157 | match = (*addr >= val); \ | 160 | match = (*addr >= val); \ |
158 | break; \ | 161 | break; \ |
162 | case OP_BAND: \ | ||
163 | match = (*addr & val); \ | ||
164 | break; \ | ||
159 | default: \ | 165 | default: \ |
160 | break; \ | 166 | break; \ |
161 | } \ | 167 | } \ |
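
The trace_events_filter.c hunks add a bitwise-AND operator, OP_BAND, spelled "&" with precedence 6, to the numeric predicates, so a filter can now test individual flag bits instead of only whole values. A small stand-alone sketch of the comparison switch with the new case (plain C, not the kernel's macro-generated predicate) might be:

#include <stdbool.h>
#include <stdint.h>

enum op { OP_LT, OP_LE, OP_GT, OP_GE, OP_BAND };

/* OP_BAND matches when any bit of 'val' is set in the event field,
 * mirroring the "match = (*addr & val)" case added above. */
static bool compare_u32(enum op op, uint32_t field, uint32_t val)
{
	switch (op) {
	case OP_LT:	return field < val;
	case OP_LE:	return field <= val;
	case OP_GT:	return field > val;
	case OP_GE:	return field >= val;
	case OP_BAND:	return (field & val) != 0;
	}
	return false;
}

With the operator in place, a filter string such as "flags & 2" (field name hypothetical) selects only events whose second flag bit is set.
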
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c4d6d7191988..b863f93b30f3 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -290,6 +290,21 @@ ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) | |||
290 | trace_dump_stack(STACK_SKIP); | 290 | trace_dump_stack(STACK_SKIP); |
291 | } | 291 | } |
292 | 292 | ||
293 | static void | ||
294 | ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) | ||
295 | { | ||
296 | if (update_count(data)) | ||
297 | ftrace_dump(DUMP_ALL); | ||
298 | } | ||
299 | |||
300 | /* Only dump the current CPU buffer. */ | ||
301 | static void | ||
302 | ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) | ||
303 | { | ||
304 | if (update_count(data)) | ||
305 | ftrace_dump(DUMP_ORIG); | ||
306 | } | ||
307 | |||
293 | static int | 308 | static int |
294 | ftrace_probe_print(const char *name, struct seq_file *m, | 309 | ftrace_probe_print(const char *name, struct seq_file *m, |
295 | unsigned long ip, void *data) | 310 | unsigned long ip, void *data) |
@@ -327,6 +342,20 @@ ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, | |||
327 | return ftrace_probe_print("stacktrace", m, ip, data); | 342 | return ftrace_probe_print("stacktrace", m, ip, data); |
328 | } | 343 | } |
329 | 344 | ||
345 | static int | ||
346 | ftrace_dump_print(struct seq_file *m, unsigned long ip, | ||
347 | struct ftrace_probe_ops *ops, void *data) | ||
348 | { | ||
349 | return ftrace_probe_print("dump", m, ip, data); | ||
350 | } | ||
351 | |||
352 | static int | ||
353 | ftrace_cpudump_print(struct seq_file *m, unsigned long ip, | ||
354 | struct ftrace_probe_ops *ops, void *data) | ||
355 | { | ||
356 | return ftrace_probe_print("cpudump", m, ip, data); | ||
357 | } | ||
358 | |||
330 | static struct ftrace_probe_ops traceon_count_probe_ops = { | 359 | static struct ftrace_probe_ops traceon_count_probe_ops = { |
331 | .func = ftrace_traceon_count, | 360 | .func = ftrace_traceon_count, |
332 | .print = ftrace_traceon_print, | 361 | .print = ftrace_traceon_print, |
@@ -342,6 +371,16 @@ static struct ftrace_probe_ops stacktrace_count_probe_ops = { | |||
342 | .print = ftrace_stacktrace_print, | 371 | .print = ftrace_stacktrace_print, |
343 | }; | 372 | }; |
344 | 373 | ||
374 | static struct ftrace_probe_ops dump_probe_ops = { | ||
375 | .func = ftrace_dump_probe, | ||
376 | .print = ftrace_dump_print, | ||
377 | }; | ||
378 | |||
379 | static struct ftrace_probe_ops cpudump_probe_ops = { | ||
380 | .func = ftrace_cpudump_probe, | ||
381 | .print = ftrace_cpudump_print, | ||
382 | }; | ||
383 | |||
345 | static struct ftrace_probe_ops traceon_probe_ops = { | 384 | static struct ftrace_probe_ops traceon_probe_ops = { |
346 | .func = ftrace_traceon, | 385 | .func = ftrace_traceon, |
347 | .print = ftrace_traceon_print, | 386 | .print = ftrace_traceon_print, |
@@ -425,6 +464,32 @@ ftrace_stacktrace_callback(struct ftrace_hash *hash, | |||
425 | param, enable); | 464 | param, enable); |
426 | } | 465 | } |
427 | 466 | ||
467 | static int | ||
468 | ftrace_dump_callback(struct ftrace_hash *hash, | ||
469 | char *glob, char *cmd, char *param, int enable) | ||
470 | { | ||
471 | struct ftrace_probe_ops *ops; | ||
472 | |||
473 | ops = &dump_probe_ops; | ||
474 | |||
475 | /* Only dump once. */ | ||
476 | return ftrace_trace_probe_callback(ops, hash, glob, cmd, | ||
477 | "1", enable); | ||
478 | } | ||
479 | |||
480 | static int | ||
481 | ftrace_cpudump_callback(struct ftrace_hash *hash, | ||
482 | char *glob, char *cmd, char *param, int enable) | ||
483 | { | ||
484 | struct ftrace_probe_ops *ops; | ||
485 | |||
486 | ops = &cpudump_probe_ops; | ||
487 | |||
488 | /* Only dump once. */ | ||
489 | return ftrace_trace_probe_callback(ops, hash, glob, cmd, | ||
490 | "1", enable); | ||
491 | } | ||
492 | |||
428 | static struct ftrace_func_command ftrace_traceon_cmd = { | 493 | static struct ftrace_func_command ftrace_traceon_cmd = { |
429 | .name = "traceon", | 494 | .name = "traceon", |
430 | .func = ftrace_trace_onoff_callback, | 495 | .func = ftrace_trace_onoff_callback, |
@@ -440,6 +505,16 @@ static struct ftrace_func_command ftrace_stacktrace_cmd = { | |||
440 | .func = ftrace_stacktrace_callback, | 505 | .func = ftrace_stacktrace_callback, |
441 | }; | 506 | }; |
442 | 507 | ||
508 | static struct ftrace_func_command ftrace_dump_cmd = { | ||
509 | .name = "dump", | ||
510 | .func = ftrace_dump_callback, | ||
511 | }; | ||
512 | |||
513 | static struct ftrace_func_command ftrace_cpudump_cmd = { | ||
514 | .name = "cpudump", | ||
515 | .func = ftrace_cpudump_callback, | ||
516 | }; | ||
517 | |||
443 | static int __init init_func_cmd_traceon(void) | 518 | static int __init init_func_cmd_traceon(void) |
444 | { | 519 | { |
445 | int ret; | 520 | int ret; |
@@ -450,13 +525,31 @@ static int __init init_func_cmd_traceon(void) | |||
450 | 525 | ||
451 | ret = register_ftrace_command(&ftrace_traceon_cmd); | 526 | ret = register_ftrace_command(&ftrace_traceon_cmd); |
452 | if (ret) | 527 | if (ret) |
453 | unregister_ftrace_command(&ftrace_traceoff_cmd); | 528 | goto out_free_traceoff; |
454 | 529 | ||
455 | ret = register_ftrace_command(&ftrace_stacktrace_cmd); | 530 | ret = register_ftrace_command(&ftrace_stacktrace_cmd); |
456 | if (ret) { | 531 | if (ret) |
457 | unregister_ftrace_command(&ftrace_traceoff_cmd); | 532 | goto out_free_traceon; |
458 | unregister_ftrace_command(&ftrace_traceon_cmd); | 533 | |
459 | } | 534 | ret = register_ftrace_command(&ftrace_dump_cmd); |
535 | if (ret) | ||
536 | goto out_free_stacktrace; | ||
537 | |||
538 | ret = register_ftrace_command(&ftrace_cpudump_cmd); | ||
539 | if (ret) | ||
540 | goto out_free_dump; | ||
541 | |||
542 | return 0; | ||
543 | |||
544 | out_free_dump: | ||
545 | unregister_ftrace_command(&ftrace_dump_cmd); | ||
546 | out_free_stacktrace: | ||
547 | unregister_ftrace_command(&ftrace_stacktrace_cmd); | ||
548 | out_free_traceon: | ||
549 | unregister_ftrace_command(&ftrace_traceon_cmd); | ||
550 | out_free_traceoff: | ||
551 | unregister_ftrace_command(&ftrace_traceoff_cmd); | ||
552 | |||
460 | return ret; | 553 | return ret; |
461 | } | 554 | } |
462 | #else | 555 | #else |
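
init_func_cmd_traceon() above is reshaped into the usual goto-unwind form: each registration that can fail jumps to a label that unregisters, in reverse order, everything that already succeeded, which scales cleanly as the two new dump/cpudump commands are added. A generic sketch of that error-handling pattern, with hypothetical register_a/b/c helpers rather than the ftrace command API, is:

/* Hypothetical registration helpers, declared only for the sketch. */
int register_a(void);
int register_b(void);
int register_c(void);
void unregister_a(void);
void unregister_b(void);

static int setup_all(void)
{
	int ret;

	ret = register_a();
	if (ret)
		return ret;

	ret = register_b();
	if (ret)
		goto out_unreg_a;

	ret = register_c();
	if (ret)
		goto out_unreg_b;

	return 0;

out_unreg_b:
	unregister_b();
out_unreg_a:
	unregister_a();
	return ret;
}
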
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b19d065a28cb..2aefbee93a6d 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -373,7 +373,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) | |||
373 | struct trace_array_cpu *data; | 373 | struct trace_array_cpu *data; |
374 | unsigned long flags; | 374 | unsigned long flags; |
375 | 375 | ||
376 | if (likely(!tracer_enabled)) | 376 | if (!tracer_enabled || !tracing_is_enabled()) |
377 | return; | 377 | return; |
378 | 378 | ||
379 | cpu = raw_smp_processor_id(); | 379 | cpu = raw_smp_processor_id(); |
@@ -416,7 +416,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) | |||
416 | else | 416 | else |
417 | return; | 417 | return; |
418 | 418 | ||
419 | if (!tracer_enabled) | 419 | if (!tracer_enabled || !tracing_is_enabled()) |
420 | return; | 420 | return; |
421 | 421 | ||
422 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); | 422 | data = per_cpu_ptr(tr->trace_buffer.data, cpu); |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9f46e98ba8f2..7ed6976493c8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -35,12 +35,17 @@ struct trace_probe { | |||
35 | const char *symbol; /* symbol name */ | 35 | const char *symbol; /* symbol name */ |
36 | struct ftrace_event_class class; | 36 | struct ftrace_event_class class; |
37 | struct ftrace_event_call call; | 37 | struct ftrace_event_call call; |
38 | struct ftrace_event_file * __rcu *files; | 38 | struct list_head files; |
39 | ssize_t size; /* trace entry size */ | 39 | ssize_t size; /* trace entry size */ |
40 | unsigned int nr_args; | 40 | unsigned int nr_args; |
41 | struct probe_arg args[]; | 41 | struct probe_arg args[]; |
42 | }; | 42 | }; |
43 | 43 | ||
44 | struct event_file_link { | ||
45 | struct ftrace_event_file *file; | ||
46 | struct list_head list; | ||
47 | }; | ||
48 | |||
44 | #define SIZEOF_TRACE_PROBE(n) \ | 49 | #define SIZEOF_TRACE_PROBE(n) \ |
45 | (offsetof(struct trace_probe, args) + \ | 50 | (offsetof(struct trace_probe, args) + \ |
46 | (sizeof(struct probe_arg) * (n))) | 51 | (sizeof(struct probe_arg) * (n))) |
@@ -150,6 +155,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, | |||
150 | goto error; | 155 | goto error; |
151 | 156 | ||
152 | INIT_LIST_HEAD(&tp->list); | 157 | INIT_LIST_HEAD(&tp->list); |
158 | INIT_LIST_HEAD(&tp->files); | ||
153 | return tp; | 159 | return tp; |
154 | error: | 160 | error: |
155 | kfree(tp->call.name); | 161 | kfree(tp->call.name); |
@@ -183,25 +189,6 @@ static struct trace_probe *find_trace_probe(const char *event, | |||
183 | return NULL; | 189 | return NULL; |
184 | } | 190 | } |
185 | 191 | ||
186 | static int trace_probe_nr_files(struct trace_probe *tp) | ||
187 | { | ||
188 | struct ftrace_event_file **file; | ||
189 | int ret = 0; | ||
190 | |||
191 | /* | ||
192 | * Since all tp->files updater is protected by probe_enable_lock, | ||
193 | * we don't need to lock an rcu_read_lock. | ||
194 | */ | ||
195 | file = rcu_dereference_raw(tp->files); | ||
196 | if (file) | ||
197 | while (*(file++)) | ||
198 | ret++; | ||
199 | |||
200 | return ret; | ||
201 | } | ||
202 | |||
203 | static DEFINE_MUTEX(probe_enable_lock); | ||
204 | |||
205 | /* | 192 | /* |
206 | * Enable trace_probe | 193 | * Enable trace_probe |
207 | * if the file is NULL, enable "perf" handler, or enable "trace" handler. | 194 | * if the file is NULL, enable "perf" handler, or enable "trace" handler. |
@@ -211,67 +198,42 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) | |||
211 | { | 198 | { |
212 | int ret = 0; | 199 | int ret = 0; |
213 | 200 | ||
214 | mutex_lock(&probe_enable_lock); | ||
215 | |||
216 | if (file) { | 201 | if (file) { |
217 | struct ftrace_event_file **new, **old; | 202 | struct event_file_link *link; |
218 | int n = trace_probe_nr_files(tp); | 203 | |
219 | 204 | link = kmalloc(sizeof(*link), GFP_KERNEL); | |
220 | old = rcu_dereference_raw(tp->files); | 205 | if (!link) { |
221 | /* 1 is for new one and 1 is for stopper */ | ||
222 | new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *), | ||
223 | GFP_KERNEL); | ||
224 | if (!new) { | ||
225 | ret = -ENOMEM; | 206 | ret = -ENOMEM; |
226 | goto out_unlock; | 207 | goto out; |
227 | } | 208 | } |
228 | memcpy(new, old, n * sizeof(struct ftrace_event_file *)); | ||
229 | new[n] = file; | ||
230 | /* The last one keeps a NULL */ | ||
231 | 209 | ||
232 | rcu_assign_pointer(tp->files, new); | 210 | link->file = file; |
233 | tp->flags |= TP_FLAG_TRACE; | 211 | list_add_tail_rcu(&link->list, &tp->files); |
234 | 212 | ||
235 | if (old) { | 213 | tp->flags |= TP_FLAG_TRACE; |
236 | /* Make sure the probe is done with old files */ | ||
237 | synchronize_sched(); | ||
238 | kfree(old); | ||
239 | } | ||
240 | } else | 214 | } else |
241 | tp->flags |= TP_FLAG_PROFILE; | 215 | tp->flags |= TP_FLAG_PROFILE; |
242 | 216 | ||
243 | if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && | 217 | if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) { |
244 | !trace_probe_has_gone(tp)) { | ||
245 | if (trace_probe_is_return(tp)) | 218 | if (trace_probe_is_return(tp)) |
246 | ret = enable_kretprobe(&tp->rp); | 219 | ret = enable_kretprobe(&tp->rp); |
247 | else | 220 | else |
248 | ret = enable_kprobe(&tp->rp.kp); | 221 | ret = enable_kprobe(&tp->rp.kp); |
249 | } | 222 | } |
250 | 223 | out: | |
251 | out_unlock: | ||
252 | mutex_unlock(&probe_enable_lock); | ||
253 | |||
254 | return ret; | 224 | return ret; |
255 | } | 225 | } |
256 | 226 | ||
257 | static int | 227 | static struct event_file_link * |
258 | trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) | 228 | find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) |
259 | { | 229 | { |
260 | struct ftrace_event_file **files; | 230 | struct event_file_link *link; |
261 | int i; | ||
262 | 231 | ||
263 | /* | 232 | list_for_each_entry(link, &tp->files, list) |
264 | * Since all tp->files updater is protected by probe_enable_lock, | 233 | if (link->file == file) |
265 | * we don't need to lock an rcu_read_lock. | 234 | return link; |
266 | */ | ||
267 | files = rcu_dereference_raw(tp->files); | ||
268 | if (files) { | ||
269 | for (i = 0; files[i]; i++) | ||
270 | if (files[i] == file) | ||
271 | return i; | ||
272 | } | ||
273 | 235 | ||
274 | return -1; | 236 | return NULL; |
275 | } | 237 | } |
276 | 238 | ||
277 | /* | 239 | /* |
@@ -283,41 +245,24 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) | |||
283 | { | 245 | { |
284 | int ret = 0; | 246 | int ret = 0; |
285 | 247 | ||
286 | mutex_lock(&probe_enable_lock); | ||
287 | |||
288 | if (file) { | 248 | if (file) { |
289 | struct ftrace_event_file **new, **old; | 249 | struct event_file_link *link; |
290 | int n = trace_probe_nr_files(tp); | ||
291 | int i, j; | ||
292 | 250 | ||
293 | old = rcu_dereference_raw(tp->files); | 251 | link = find_event_file_link(tp, file); |
294 | if (n == 0 || trace_probe_file_index(tp, file) < 0) { | 252 | if (!link) { |
295 | ret = -EINVAL; | 253 | ret = -EINVAL; |
296 | goto out_unlock; | 254 | goto out; |
297 | } | 255 | } |
298 | 256 | ||
299 | if (n == 1) { /* Remove the last file */ | 257 | list_del_rcu(&link->list); |
300 | tp->flags &= ~TP_FLAG_TRACE; | 258 | /* synchronize with kprobe_trace_func/kretprobe_trace_func */ |
301 | new = NULL; | 259 | synchronize_sched(); |
302 | } else { | 260 | kfree(link); |
303 | new = kzalloc(n * sizeof(struct ftrace_event_file *), | ||
304 | GFP_KERNEL); | ||
305 | if (!new) { | ||
306 | ret = -ENOMEM; | ||
307 | goto out_unlock; | ||
308 | } | ||
309 | |||
310 | /* This copy & check loop copies the NULL stopper too */ | ||
311 | for (i = 0, j = 0; j < n && i < n + 1; i++) | ||
312 | if (old[i] != file) | ||
313 | new[j++] = old[i]; | ||
314 | } | ||
315 | 261 | ||
316 | rcu_assign_pointer(tp->files, new); | 262 | if (!list_empty(&tp->files)) |
263 | goto out; | ||
317 | 264 | ||
318 | /* Make sure the probe is done with old files */ | 265 | tp->flags &= ~TP_FLAG_TRACE; |
319 | synchronize_sched(); | ||
320 | kfree(old); | ||
321 | } else | 266 | } else |
322 | tp->flags &= ~TP_FLAG_PROFILE; | 267 | tp->flags &= ~TP_FLAG_PROFILE; |
323 | 268 | ||
@@ -327,10 +272,7 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) | |||
327 | else | 272 | else |
328 | disable_kprobe(&tp->rp.kp); | 273 | disable_kprobe(&tp->rp.kp); |
329 | } | 274 | } |
330 | 275 | out: | |
331 | out_unlock: | ||
332 | mutex_unlock(&probe_enable_lock); | ||
333 | |||
334 | return ret; | 276 | return ret; |
335 | } | 277 | } |
336 | 278 | ||
@@ -885,20 +827,10 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, | |||
885 | static __kprobes void | 827 | static __kprobes void |
886 | kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) | 828 | kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) |
887 | { | 829 | { |
888 | /* | 830 | struct event_file_link *link; |
889 | * Note: preempt is already disabled around the kprobe handler. | ||
890 | * However, we still need an smp_read_barrier_depends() corresponding | ||
891 | * to smp_wmb() in rcu_assign_pointer() to access the pointer. | ||
892 | */ | ||
893 | struct ftrace_event_file **file = rcu_dereference_raw(tp->files); | ||
894 | |||
895 | if (unlikely(!file)) | ||
896 | return; | ||
897 | 831 | ||
898 | while (*file) { | 832 | list_for_each_entry_rcu(link, &tp->files, list) |
899 | __kprobe_trace_func(tp, regs, *file); | 833 | __kprobe_trace_func(tp, regs, link->file); |
900 | file++; | ||
901 | } | ||
902 | } | 834 | } |
903 | 835 | ||
904 | /* Kretprobe handler */ | 836 | /* Kretprobe handler */ |
@@ -945,20 +877,10 @@ static __kprobes void | |||
945 | kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, | 877 | kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, |
946 | struct pt_regs *regs) | 878 | struct pt_regs *regs) |
947 | { | 879 | { |
948 | /* | 880 | struct event_file_link *link; |
949 | * Note: preempt is already disabled around the kprobe handler. | ||
950 | * However, we still need an smp_read_barrier_depends() corresponding | ||
951 | * to smp_wmb() in rcu_assign_pointer() to access the pointer. | ||
952 | */ | ||
953 | struct ftrace_event_file **file = rcu_dereference_raw(tp->files); | ||
954 | 881 | ||
955 | if (unlikely(!file)) | 882 | list_for_each_entry_rcu(link, &tp->files, list) |
956 | return; | 883 | __kretprobe_trace_func(tp, ri, regs, link->file); |
957 | |||
958 | while (*file) { | ||
959 | __kretprobe_trace_func(tp, ri, regs, *file); | ||
960 | file++; | ||
961 | } | ||
962 | } | 884 | } |
963 | 885 | ||
964 | /* Event entry printers */ | 886 | /* Event entry printers */ |
@@ -1157,6 +1079,10 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) | |||
1157 | int size, __size, dsize; | 1079 | int size, __size, dsize; |
1158 | int rctx; | 1080 | int rctx; |
1159 | 1081 | ||
1082 | head = this_cpu_ptr(call->perf_events); | ||
1083 | if (hlist_empty(head)) | ||
1084 | return; | ||
1085 | |||
1160 | dsize = __get_data_size(tp, regs); | 1086 | dsize = __get_data_size(tp, regs); |
1161 | __size = sizeof(*entry) + tp->size + dsize; | 1087 | __size = sizeof(*entry) + tp->size + dsize; |
1162 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1088 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
@@ -1172,10 +1098,7 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) | |||
1172 | entry->ip = (unsigned long)tp->rp.kp.addr; | 1098 | entry->ip = (unsigned long)tp->rp.kp.addr; |
1173 | memset(&entry[1], 0, dsize); | 1099 | memset(&entry[1], 0, dsize); |
1174 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 1100 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1175 | 1101 | perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); | |
1176 | head = this_cpu_ptr(call->perf_events); | ||
1177 | perf_trace_buf_submit(entry, size, rctx, | ||
1178 | entry->ip, 1, regs, head, NULL); | ||
1179 | } | 1102 | } |
1180 | 1103 | ||
1181 | /* Kretprobe profile handler */ | 1104 | /* Kretprobe profile handler */ |
@@ -1189,6 +1112,10 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, | |||
1189 | int size, __size, dsize; | 1112 | int size, __size, dsize; |
1190 | int rctx; | 1113 | int rctx; |
1191 | 1114 | ||
1115 | head = this_cpu_ptr(call->perf_events); | ||
1116 | if (hlist_empty(head)) | ||
1117 | return; | ||
1118 | |||
1192 | dsize = __get_data_size(tp, regs); | 1119 | dsize = __get_data_size(tp, regs); |
1193 | __size = sizeof(*entry) + tp->size + dsize; | 1120 | __size = sizeof(*entry) + tp->size + dsize; |
1194 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1121 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
@@ -1204,13 +1131,16 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, | |||
1204 | entry->func = (unsigned long)tp->rp.kp.addr; | 1131 | entry->func = (unsigned long)tp->rp.kp.addr; |
1205 | entry->ret_ip = (unsigned long)ri->ret_addr; | 1132 | entry->ret_ip = (unsigned long)ri->ret_addr; |
1206 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 1133 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1207 | 1134 | perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); | |
1208 | head = this_cpu_ptr(call->perf_events); | ||
1209 | perf_trace_buf_submit(entry, size, rctx, | ||
1210 | entry->ret_ip, 1, regs, head, NULL); | ||
1211 | } | 1135 | } |
1212 | #endif /* CONFIG_PERF_EVENTS */ | 1136 | #endif /* CONFIG_PERF_EVENTS */ |
1213 | 1137 | ||
1138 | /* | ||
1139 | * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. | ||
1140 | * | ||
1141 | * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe | ||
1142 | * lockless, but we can't race with this __init function. | ||
1143 | */ | ||
1214 | static __kprobes | 1144 | static __kprobes |
1215 | int kprobe_register(struct ftrace_event_call *event, | 1145 | int kprobe_register(struct ftrace_event_call *event, |
1216 | enum trace_reg type, void *data) | 1146 | enum trace_reg type, void *data) |
@@ -1376,6 +1306,10 @@ find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr) | |||
1376 | return NULL; | 1306 | return NULL; |
1377 | } | 1307 | } |
1378 | 1308 | ||
1309 | /* | ||
1310 | * Nobody but us can call enable_trace_probe/disable_trace_probe at this | ||
1311 | * stage, we can do this lockless. | ||
1312 | */ | ||
1379 | static __init int kprobe_trace_self_tests_init(void) | 1313 | static __init int kprobe_trace_self_tests_init(void) |
1380 | { | 1314 | { |
1381 | int ret, warn = 0; | 1315 | int ret, warn = 0; |
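
The trace_kprobe.c rework above replaces the reallocated, RCU-assigned pointer array (and its private probe_enable_lock) with an ordinary RCU-protected list of event_file_link entries; writers rely on event_mutex, and the probe handlers walk the list with list_for_each_entry_rcu() since preemption is already disabled there. A minimal kernel-style sketch of the add / iterate / remove pattern, with hypothetical names, looks like:

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hypothetical stand-ins for event_file_link and tp->files. */
struct file_link {
	void			*file;
	struct list_head	list;
};

static LIST_HEAD(links);

/* Update side: callers are assumed to serialize against each other
 * (the patch uses event_mutex for this). */
static int link_add(void *file)
{
	struct file_link *link = kmalloc(sizeof(*link), GFP_KERNEL);

	if (!link)
		return -ENOMEM;
	link->file = file;
	list_add_tail_rcu(&link->list, &links);
	return 0;
}

static void link_del(struct file_link *link)
{
	list_del_rcu(&link->list);
	synchronize_rcu();		/* let readers finish before freeing */
	kfree(link);
}

/* Read side: a plain RCU read-side critical section. */
static void for_each_file(void (*fn)(void *file))
{
	struct file_link *link;

	rcu_read_lock();
	list_for_each_entry_rcu(link, &links, list)
		fn(link->file);
	rcu_read_unlock();
}

The patch itself uses synchronize_sched() on removal because the kprobe handlers run with preemption disabled rather than under rcu_read_lock().
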
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 2901e3b88590..a7329b7902f8 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -640,13 +640,20 @@ out: | |||
640 | * Enable ftrace, sleep 1/10 second, and then read the trace | 640 | * Enable ftrace, sleep 1/10 second, and then read the trace |
641 | * buffer to see if all is in order. | 641 | * buffer to see if all is in order. |
642 | */ | 642 | */ |
643 | int | 643 | __init int |
644 | trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) | 644 | trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) |
645 | { | 645 | { |
646 | int save_ftrace_enabled = ftrace_enabled; | 646 | int save_ftrace_enabled = ftrace_enabled; |
647 | unsigned long count; | 647 | unsigned long count; |
648 | int ret; | 648 | int ret; |
649 | 649 | ||
650 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
651 | if (ftrace_filter_param) { | ||
652 | printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); | ||
653 | return 0; | ||
654 | } | ||
655 | #endif | ||
656 | |||
650 | /* make sure msleep has been recorded */ | 657 | /* make sure msleep has been recorded */ |
651 | msleep(1); | 658 | msleep(1); |
652 | 659 | ||
@@ -727,13 +734,20 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) | |||
727 | * Pretty much the same than for the function tracer from which the selftest | 734 | * Pretty much the same than for the function tracer from which the selftest |
728 | * has been borrowed. | 735 | * has been borrowed. |
729 | */ | 736 | */ |
730 | int | 737 | __init int |
731 | trace_selftest_startup_function_graph(struct tracer *trace, | 738 | trace_selftest_startup_function_graph(struct tracer *trace, |
732 | struct trace_array *tr) | 739 | struct trace_array *tr) |
733 | { | 740 | { |
734 | int ret; | 741 | int ret; |
735 | unsigned long count; | 742 | unsigned long count; |
736 | 743 | ||
744 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
745 | if (ftrace_filter_param) { | ||
746 | printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); | ||
747 | return 0; | ||
748 | } | ||
749 | #endif | ||
750 | |||
737 | /* | 751 | /* |
738 | * Simulate the init() callback but we attach a watchdog callback | 752 | * Simulate the init() callback but we attach a watchdog callback |
739 | * to detect and recover from possible hangs | 753 | * to detect and recover from possible hangs |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8f2ac73c7a5f..322e16461072 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -306,6 +306,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) | |||
306 | struct syscall_metadata *sys_data; | 306 | struct syscall_metadata *sys_data; |
307 | struct ring_buffer_event *event; | 307 | struct ring_buffer_event *event; |
308 | struct ring_buffer *buffer; | 308 | struct ring_buffer *buffer; |
309 | unsigned long irq_flags; | ||
310 | int pc; | ||
309 | int syscall_nr; | 311 | int syscall_nr; |
310 | int size; | 312 | int size; |
311 | 313 | ||
@@ -321,9 +323,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) | |||
321 | 323 | ||
322 | size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; | 324 | size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; |
323 | 325 | ||
326 | local_save_flags(irq_flags); | ||
327 | pc = preempt_count(); | ||
328 | |||
324 | buffer = tr->trace_buffer.buffer; | 329 | buffer = tr->trace_buffer.buffer; |
325 | event = trace_buffer_lock_reserve(buffer, | 330 | event = trace_buffer_lock_reserve(buffer, |
326 | sys_data->enter_event->event.type, size, 0, 0); | 331 | sys_data->enter_event->event.type, size, irq_flags, pc); |
327 | if (!event) | 332 | if (!event) |
328 | return; | 333 | return; |
329 | 334 | ||
@@ -333,7 +338,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) | |||
333 | 338 | ||
334 | if (!filter_current_check_discard(buffer, sys_data->enter_event, | 339 | if (!filter_current_check_discard(buffer, sys_data->enter_event, |
335 | entry, event)) | 340 | entry, event)) |
336 | trace_current_buffer_unlock_commit(buffer, event, 0, 0); | 341 | trace_current_buffer_unlock_commit(buffer, event, |
342 | irq_flags, pc); | ||
337 | } | 343 | } |
338 | 344 | ||
339 | static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) | 345 | static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) |
@@ -343,6 +349,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) | |||
343 | struct syscall_metadata *sys_data; | 349 | struct syscall_metadata *sys_data; |
344 | struct ring_buffer_event *event; | 350 | struct ring_buffer_event *event; |
345 | struct ring_buffer *buffer; | 351 | struct ring_buffer *buffer; |
352 | unsigned long irq_flags; | ||
353 | int pc; | ||
346 | int syscall_nr; | 354 | int syscall_nr; |
347 | 355 | ||
348 | syscall_nr = trace_get_syscall_nr(current, regs); | 356 | syscall_nr = trace_get_syscall_nr(current, regs); |
@@ -355,9 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) | |||
355 | if (!sys_data) | 363 | if (!sys_data) |
356 | return; | 364 | return; |
357 | 365 | ||
366 | local_save_flags(irq_flags); | ||
367 | pc = preempt_count(); | ||
368 | |||
358 | buffer = tr->trace_buffer.buffer; | 369 | buffer = tr->trace_buffer.buffer; |
359 | event = trace_buffer_lock_reserve(buffer, | 370 | event = trace_buffer_lock_reserve(buffer, |
360 | sys_data->exit_event->event.type, sizeof(*entry), 0, 0); | 371 | sys_data->exit_event->event.type, sizeof(*entry), |
372 | irq_flags, pc); | ||
361 | if (!event) | 373 | if (!event) |
362 | return; | 374 | return; |
363 | 375 | ||
@@ -367,7 +379,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) | |||
367 | 379 | ||
368 | if (!filter_current_check_discard(buffer, sys_data->exit_event, | 380 | if (!filter_current_check_discard(buffer, sys_data->exit_event, |
369 | entry, event)) | 381 | entry, event)) |
370 | trace_current_buffer_unlock_commit(buffer, event, 0, 0); | 382 | trace_current_buffer_unlock_commit(buffer, event, |
383 | irq_flags, pc); | ||
371 | } | 384 | } |
372 | 385 | ||
373 | static int reg_event_syscall_enter(struct ftrace_event_file *file, | 386 | static int reg_event_syscall_enter(struct ftrace_event_file *file, |
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 32494fb0ee64..d5d0cd368a56 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -283,8 +283,10 @@ static int create_trace_uprobe(int argc, char **argv) | |||
283 | return -EINVAL; | 283 | return -EINVAL; |
284 | } | 284 | } |
285 | arg = strchr(argv[1], ':'); | 285 | arg = strchr(argv[1], ':'); |
286 | if (!arg) | 286 | if (!arg) { |
287 | ret = -EINVAL; | ||
287 | goto fail_address_parse; | 288 | goto fail_address_parse; |
289 | } | ||
288 | 290 | ||
289 | *arg++ = '\0'; | 291 | *arg++ = '\0'; |
290 | filename = argv[1]; | 292 | filename = argv[1]; |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee8e29a2320c..f02c4a4a0c3c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -272,6 +272,15 @@ static cpumask_var_t *wq_numa_possible_cpumask; | |||
272 | static bool wq_disable_numa; | 272 | static bool wq_disable_numa; |
273 | module_param_named(disable_numa, wq_disable_numa, bool, 0444); | 273 | module_param_named(disable_numa, wq_disable_numa, bool, 0444); |
274 | 274 | ||
275 | /* see the comment above the definition of WQ_POWER_EFFICIENT */ | ||
276 | #ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT | ||
277 | static bool wq_power_efficient = true; | ||
278 | #else | ||
279 | static bool wq_power_efficient; | ||
280 | #endif | ||
281 | |||
282 | module_param_named(power_efficient, wq_power_efficient, bool, 0444); | ||
283 | |||
275 | static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ | 284 | static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ |
276 | 285 | ||
277 | /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ | 286 | /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ |
@@ -305,6 +314,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly; | |||
305 | EXPORT_SYMBOL_GPL(system_unbound_wq); | 314 | EXPORT_SYMBOL_GPL(system_unbound_wq); |
306 | struct workqueue_struct *system_freezable_wq __read_mostly; | 315 | struct workqueue_struct *system_freezable_wq __read_mostly; |
307 | EXPORT_SYMBOL_GPL(system_freezable_wq); | 316 | EXPORT_SYMBOL_GPL(system_freezable_wq); |
317 | struct workqueue_struct *system_power_efficient_wq __read_mostly; | ||
318 | EXPORT_SYMBOL_GPL(system_power_efficient_wq); | ||
319 | struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; | ||
320 | EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); | ||
308 | 321 | ||
309 | static int worker_thread(void *__worker); | 322 | static int worker_thread(void *__worker); |
310 | static void copy_workqueue_attrs(struct workqueue_attrs *to, | 323 | static void copy_workqueue_attrs(struct workqueue_attrs *to, |
@@ -4086,6 +4099,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
4086 | struct workqueue_struct *wq; | 4099 | struct workqueue_struct *wq; |
4087 | struct pool_workqueue *pwq; | 4100 | struct pool_workqueue *pwq; |
4088 | 4101 | ||
4102 | /* see the comment above the definition of WQ_POWER_EFFICIENT */ | ||
4103 | if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) | ||
4104 | flags |= WQ_UNBOUND; | ||
4105 | |||
4089 | /* allocate wq and format name */ | 4106 | /* allocate wq and format name */ |
4090 | if (flags & WQ_UNBOUND) | 4107 | if (flags & WQ_UNBOUND) |
4091 | tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); | 4108 | tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); |
@@ -4985,8 +5002,15 @@ static int __init init_workqueues(void) | |||
4985 | WQ_UNBOUND_MAX_ACTIVE); | 5002 | WQ_UNBOUND_MAX_ACTIVE); |
4986 | system_freezable_wq = alloc_workqueue("events_freezable", | 5003 | system_freezable_wq = alloc_workqueue("events_freezable", |
4987 | WQ_FREEZABLE, 0); | 5004 | WQ_FREEZABLE, 0); |
5005 | system_power_efficient_wq = alloc_workqueue("events_power_efficient", | ||
5006 | WQ_POWER_EFFICIENT, 0); | ||
5007 | system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient", | ||
5008 | WQ_FREEZABLE | WQ_POWER_EFFICIENT, | ||
5009 | 0); | ||
4988 | BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || | 5010 | BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || |
4989 | !system_unbound_wq || !system_freezable_wq); | 5011 | !system_unbound_wq || !system_freezable_wq || |
5012 | !system_power_efficient_wq || | ||
5013 | !system_freezable_power_efficient_wq); | ||
4990 | return 0; | 5014 | return 0; |
4991 | } | 5015 | } |
4992 | early_initcall(init_workqueues); | 5016 | early_initcall(init_workqueues); |
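
The workqueue.c hunks introduce a workqueue.power_efficient boot/module parameter (default chosen by CONFIG_WQ_POWER_EFFICIENT_DEFAULT) plus two new system workqueues; when the switch is on, WQ_POWER_EFFICIENT queues are promoted to WQ_UNBOUND so the scheduler can place the work on an already-running CPU instead of waking the local idle one. A minimal driver-side usage sketch (the mydrv_* names are hypothetical, the workqueue calls are not) might be:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *mydrv_wq;
static struct work_struct mydrv_work;

static void mydrv_work_fn(struct work_struct *work)
{
	/* background work that is not latency critical */
}

static int __init mydrv_init(void)
{
	/* Becomes an unbound queue when workqueue.power_efficient is set. */
	mydrv_wq = alloc_workqueue("mydrv", WQ_POWER_EFFICIENT, 0);
	if (!mydrv_wq)
		return -ENOMEM;

	INIT_WORK(&mydrv_work, mydrv_work_fn);
	queue_work(mydrv_wq, &mydrv_work);

	/* Alternatively, skip the private queue and queue the work on the
	 * system_power_efficient_wq exported above. */
	return 0;
}
module_init(mydrv_init);

MODULE_LICENSE("GPL");
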