Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 6
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/audit_tree.c | 1
-rw-r--r--  kernel/cgroup.c | 1536
-rw-r--r--  kernel/context_tracking.c | 41
-rw-r--r--  kernel/cpu.c | 55
-rw-r--r--  kernel/cpu/idle.c | 17
-rw-r--r--  kernel/cpuset.c | 482
-rw-r--r--  kernel/events/core.c | 511
-rw-r--r--  kernel/events/hw_breakpoint.c | 193
-rw-r--r--  kernel/events/internal.h | 4
-rw-r--r--  kernel/exit.c | 15
-rw-r--r--  kernel/fork.c | 66
-rw-r--r--  kernel/freezer.c | 12
-rw-r--r--  kernel/futex.c | 6
-rw-r--r--  kernel/hrtimer.c | 3
-rw-r--r--  kernel/irq/chip.c | 13
-rw-r--r--  kernel/irq/generic-chip.c | 314
-rw-r--r--  kernel/irq/irqdomain.c | 587
-rw-r--r--  kernel/irq/manage.c | 17
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/kmod.c | 11
-rw-r--r--  kernel/kprobes.c | 33
-rw-r--r--  kernel/lockdep.c | 17
-rw-r--r--  kernel/mutex.c | 384
-rw-r--r--  kernel/pid.c | 14
-rw-r--r--  kernel/power/Kconfig | 21
-rw-r--r--  kernel/power/main.c | 6
-rw-r--r--  kernel/power/process.c | 26
-rw-r--r--  kernel/power/qos.c | 14
-rw-r--r--  kernel/power/snapshot.c | 9
-rw-r--r--  kernel/power/suspend.c | 2
-rw-r--r--  kernel/printk.c | 91
-rw-r--r--  kernel/ptrace.c | 64
-rw-r--r--  kernel/range.c | 21
-rw-r--r--  kernel/rcupdate.c | 29
-rw-r--r--  kernel/rcutiny.c | 21
-rw-r--r--  kernel/rcutiny_plugin.h | 1009
-rw-r--r--  kernel/rcutorture.c | 39
-rw-r--r--  kernel/rcutree.c | 191
-rw-r--r--  kernel/rcutree.h | 17
-rw-r--r--  kernel/rcutree_plugin.h | 81
-rw-r--r--  kernel/resource.c | 2
-rw-r--r--  kernel/rtmutex.c | 13
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/auto_group.c | 3
-rw-r--r--  kernel/sched/core.c | 660
-rw-r--r--  kernel/sched/cputime.c | 11
-rw-r--r--  kernel/sched/debug.c | 37
-rw-r--r--  kernel/sched/fair.c | 175
-rw-r--r--  kernel/sched/proc.c | 591
-rw-r--r--  kernel/sched/rt.c | 132
-rw-r--r--  kernel/sched/sched.h | 71
-rw-r--r--  kernel/sched/stats.h | 8
-rw-r--r--  kernel/sched/stop_task.c | 8
-rw-r--r--  kernel/signal.c | 2
-rw-r--r--  kernel/softirq.c | 23
-rw-r--r--  kernel/sys.c | 47
-rw-r--r--  kernel/sysctl.c | 12
-rw-r--r--  kernel/time.c | 2
-rw-r--r--  kernel/time/tick-broadcast.c | 11
-rw-r--r--  kernel/time/tick-sched.c | 2
-rw-r--r--  kernel/trace/trace.c | 8
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/wait.c | 88
-rw-r--r--  kernel/workqueue.c | 26
-rw-r--r--  kernel/workqueue_internal.h | 2
67 files changed, 4066 insertions, 3855 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 44511d100eaa..d2b32ac27a39 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -138,7 +138,7 @@ config INLINE_SPIN_UNLOCK_BH
138 138
139config INLINE_SPIN_UNLOCK_IRQ 139config INLINE_SPIN_UNLOCK_IRQ
140 def_bool y 140 def_bool y
141 depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH 141 depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ
142 142
143config INLINE_SPIN_UNLOCK_IRQRESTORE 143config INLINE_SPIN_UNLOCK_IRQRESTORE
144 def_bool y 144 def_bool y
@@ -175,7 +175,7 @@ config INLINE_READ_UNLOCK_BH
175 175
176config INLINE_READ_UNLOCK_IRQ 176config INLINE_READ_UNLOCK_IRQ
177 def_bool y 177 def_bool y
178 depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH 178 depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ
179 179
180config INLINE_READ_UNLOCK_IRQRESTORE 180config INLINE_READ_UNLOCK_IRQRESTORE
181 def_bool y 181 def_bool y
@@ -212,7 +212,7 @@ config INLINE_WRITE_UNLOCK_BH
212 212
213config INLINE_WRITE_UNLOCK_IRQ 213config INLINE_WRITE_UNLOCK_IRQ
214 def_bool y 214 def_bool y
215 depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH 215 depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ
216 216
217config INLINE_WRITE_UNLOCK_IRQRESTORE 217config INLINE_WRITE_UNLOCK_IRQRESTORE
218 def_bool y 218 def_bool y
diff --git a/kernel/audit.c b/kernel/audit.c
index 21c7fa615bd3..91e53d04b6a9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1056,7 +1056,7 @@ static inline void audit_get_stamp(struct audit_context *ctx,
1056static void wait_for_auditd(unsigned long sleep_time) 1056static void wait_for_auditd(unsigned long sleep_time)
1057{ 1057{
1058 DECLARE_WAITQUEUE(wait, current); 1058 DECLARE_WAITQUEUE(wait, current);
1059 set_current_state(TASK_INTERRUPTIBLE); 1059 set_current_state(TASK_UNINTERRUPTIBLE);
1060 add_wait_queue(&audit_backlog_wait, &wait); 1060 add_wait_queue(&audit_backlog_wait, &wait);
1061 1061
1062 if (audit_backlog_limit && 1062 if (audit_backlog_limit &&
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index a291aa23fb3f..43c307dc9453 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
658 struct vfsmount *mnt; 658 struct vfsmount *mnt;
659 int err; 659 int err;
660 660
661 rule->tree = NULL;
661 list_for_each_entry(tree, &tree_list, list) { 662 list_for_each_entry(tree, &tree_list, list) {
662 if (!strcmp(seed->pathname, tree->pathname)) { 663 if (!strcmp(seed->pathname, tree->pathname)) {
663 put_tree(seed); 664 put_tree(seed);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7c9e6ddb979..e5583d10a325 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,9 +63,6 @@
63 63
64#include <linux/atomic.h> 64#include <linux/atomic.h>
65 65
66/* css deactivation bias, makes css->refcnt negative to deny new trygets */
67#define CSS_DEACT_BIAS INT_MIN
68
69/* 66/*
70 * cgroup_mutex is the master lock. Any modification to cgroup or its 67 * cgroup_mutex is the master lock. Any modification to cgroup or its
71 * hierarchy must be performed while holding it. 68 * hierarchy must be performed while holding it.
@@ -99,16 +96,19 @@ static DEFINE_MUTEX(cgroup_root_mutex);
99 */ 96 */
100#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, 97#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
101#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) 98#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
102static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { 99static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
103#include <linux/cgroup_subsys.h> 100#include <linux/cgroup_subsys.h>
104}; 101};
105 102
106/* 103/*
107 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 104 * The dummy hierarchy, reserved for the subsystems that are otherwise
108 * subsystems that are otherwise unattached - it never has more than a 105 * unattached - it never has more than a single cgroup, and all tasks are
109 * single cgroup, and all tasks are part of that cgroup. 106 * part of that cgroup.
110 */ 107 */
111static struct cgroupfs_root rootnode; 108static struct cgroupfs_root cgroup_dummy_root;
109
110/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
111static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
112 112
113/* 113/*
114 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. 114 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
@@ -186,18 +186,28 @@ struct cgroup_event {
186 186
187/* The list of hierarchy roots */ 187/* The list of hierarchy roots */
188 188
189static LIST_HEAD(roots); 189static LIST_HEAD(cgroup_roots);
190static int root_count; 190static int cgroup_root_count;
191 191
192static DEFINE_IDA(hierarchy_ida); 192/*
193static int next_hierarchy_id; 193 * Hierarchy ID allocation and mapping. It follows the same exclusion
194static DEFINE_SPINLOCK(hierarchy_id_lock); 194 * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
195 195 * writes, either for reads.
196/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 196 */
197#define dummytop (&rootnode.top_cgroup) 197static DEFINE_IDR(cgroup_hierarchy_idr);
198 198
199static struct cgroup_name root_cgroup_name = { .name = "/" }; 199static struct cgroup_name root_cgroup_name = { .name = "/" };
200 200
201/*
202 * Assign a monotonically increasing serial number to cgroups. It
203 * guarantees cgroups with bigger numbers are newer than those with smaller
204 * numbers. Also, as cgroups are always appended to the parent's
205 * ->children list, it guarantees that sibling cgroups are always sorted in
206 * the ascending serial number order on the list. Protected by
207 * cgroup_mutex.
208 */
209static u64 cgroup_serial_nr_next = 1;
210
201/* This flag indicates whether tasks in the fork and exit paths should 211/* This flag indicates whether tasks in the fork and exit paths should
202 * check for fork/exit handlers to call. This avoids us having to do 212 * check for fork/exit handlers to call. This avoids us having to do
203 * extra work in the fork/exit path if none of the subsystems need to 213 * extra work in the fork/exit path if none of the subsystems need to
@@ -205,27 +215,15 @@ static struct cgroup_name root_cgroup_name = { .name = "/" };
205 */ 215 */
206static int need_forkexit_callback __read_mostly; 216static int need_forkexit_callback __read_mostly;
207 217
218static void cgroup_offline_fn(struct work_struct *work);
208static int cgroup_destroy_locked(struct cgroup *cgrp); 219static int cgroup_destroy_locked(struct cgroup *cgrp);
209static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 220static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
210 struct cftype cfts[], bool is_add); 221 struct cftype cfts[], bool is_add);
211 222
212static int css_unbias_refcnt(int refcnt)
213{
214 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
215}
216
217/* the current nr of refs, always >= 0 whether @css is deactivated or not */
218static int css_refcnt(struct cgroup_subsys_state *css)
219{
220 int v = atomic_read(&css->refcnt);
221
222 return css_unbias_refcnt(v);
223}
224
225/* convenient tests for these bits */ 223/* convenient tests for these bits */
226inline int cgroup_is_removed(const struct cgroup *cgrp) 224static inline bool cgroup_is_dead(const struct cgroup *cgrp)
227{ 225{
228 return test_bit(CGRP_REMOVED, &cgrp->flags); 226 return test_bit(CGRP_DEAD, &cgrp->flags);
229} 227}
230 228
231/** 229/**
@@ -261,16 +259,38 @@ static int notify_on_release(const struct cgroup *cgrp)
261 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 259 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
262} 260}
263 261
264/* 262/**
265 * for_each_subsys() allows you to iterate on each subsystem attached to 263 * for_each_subsys - iterate all loaded cgroup subsystems
266 * an active hierarchy 264 * @ss: the iteration cursor
265 * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
266 *
267 * Should be called under cgroup_mutex.
267 */ 268 */
268#define for_each_subsys(_root, _ss) \ 269#define for_each_subsys(ss, i) \
269list_for_each_entry(_ss, &_root->subsys_list, sibling) 270 for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \
271 if (({ lockdep_assert_held(&cgroup_mutex); \
272 !((ss) = cgroup_subsys[i]); })) { } \
273 else
274
275/**
276 * for_each_builtin_subsys - iterate all built-in cgroup subsystems
277 * @ss: the iteration cursor
278 * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
279 *
 280 * Built-in subsystems are always present and iteration itself doesn't
281 * require any synchronization.
282 */
283#define for_each_builtin_subsys(ss, i) \
284 for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
285 (((ss) = cgroup_subsys[i]) || true); (i)++)
286
287/* iterate each subsystem attached to a hierarchy */
288#define for_each_root_subsys(root, ss) \
289 list_for_each_entry((ss), &(root)->subsys_list, sibling)
270 290
271/* for_each_active_root() allows you to iterate across the active hierarchies */ 291/* iterate across the active hierarchies */
272#define for_each_active_root(_root) \ 292#define for_each_active_root(root) \
273list_for_each_entry(_root, &roots, root_list) 293 list_for_each_entry((root), &cgroup_roots, root_list)
274 294
275static inline struct cgroup *__d_cgrp(struct dentry *dentry) 295static inline struct cgroup *__d_cgrp(struct dentry *dentry)
276{ 296{
@@ -297,7 +317,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
297static bool cgroup_lock_live_group(struct cgroup *cgrp) 317static bool cgroup_lock_live_group(struct cgroup *cgrp)
298{ 318{
299 mutex_lock(&cgroup_mutex); 319 mutex_lock(&cgroup_mutex);
300 if (cgroup_is_removed(cgrp)) { 320 if (cgroup_is_dead(cgrp)) {
301 mutex_unlock(&cgroup_mutex); 321 mutex_unlock(&cgroup_mutex);
302 return false; 322 return false;
303 } 323 }
@@ -312,20 +332,24 @@ static void cgroup_release_agent(struct work_struct *work);
312static DECLARE_WORK(release_agent_work, cgroup_release_agent); 332static DECLARE_WORK(release_agent_work, cgroup_release_agent);
313static void check_for_release(struct cgroup *cgrp); 333static void check_for_release(struct cgroup *cgrp);
314 334
315/* Link structure for associating css_set objects with cgroups */ 335/*
316struct cg_cgroup_link { 336 * A cgroup can be associated with multiple css_sets as different tasks may
317 /* 337 * belong to different cgroups on different hierarchies. In the other
318 * List running through cg_cgroup_links associated with a 338 * direction, a css_set is naturally associated with multiple cgroups.
319 * cgroup, anchored on cgroup->css_sets 339 * This M:N relationship is represented by the following link structure
320 */ 340 * which exists for each association and allows traversing the associations
321 struct list_head cgrp_link_list; 341 * from both sides.
322 struct cgroup *cgrp; 342 */
323 /* 343struct cgrp_cset_link {
324 * List running through cg_cgroup_links pointing at a 344 /* the cgroup and css_set this link associates */
325 * single css_set object, anchored on css_set->cg_links 345 struct cgroup *cgrp;
326 */ 346 struct css_set *cset;
327 struct list_head cg_link_list; 347
328 struct css_set *cg; 348 /* list of cgrp_cset_links anchored at cgrp->cset_links */
349 struct list_head cset_link;
350
351 /* list of cgrp_cset_links anchored at css_set->cgrp_links */
352 struct list_head cgrp_link;
329}; 353};
330 354
331/* The default css_set - used by init and its children prior to any 355/* The default css_set - used by init and its children prior to any
@@ -336,7 +360,7 @@ struct cg_cgroup_link {
336 */ 360 */
337 361
338static struct css_set init_css_set; 362static struct css_set init_css_set;
339static struct cg_cgroup_link init_css_set_link; 363static struct cgrp_cset_link init_cgrp_cset_link;
340 364
341static int cgroup_init_idr(struct cgroup_subsys *ss, 365static int cgroup_init_idr(struct cgroup_subsys *ss,
342 struct cgroup_subsys_state *css); 366 struct cgroup_subsys_state *css);
@@ -357,10 +381,11 @@ static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
357 381
358static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) 382static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
359{ 383{
360 int i;
361 unsigned long key = 0UL; 384 unsigned long key = 0UL;
385 struct cgroup_subsys *ss;
386 int i;
362 387
363 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) 388 for_each_subsys(ss, i)
364 key += (unsigned long)css[i]; 389 key += (unsigned long)css[i];
365 key = (key >> 16) ^ key; 390 key = (key >> 16) ^ key;
366 391
@@ -373,90 +398,83 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
373 * compiled into their kernel but not actually in use */ 398 * compiled into their kernel but not actually in use */
374static int use_task_css_set_links __read_mostly; 399static int use_task_css_set_links __read_mostly;
375 400
376static void __put_css_set(struct css_set *cg, int taskexit) 401static void __put_css_set(struct css_set *cset, int taskexit)
377{ 402{
378 struct cg_cgroup_link *link; 403 struct cgrp_cset_link *link, *tmp_link;
379 struct cg_cgroup_link *saved_link; 404
380 /* 405 /*
381 * Ensure that the refcount doesn't hit zero while any readers 406 * Ensure that the refcount doesn't hit zero while any readers
382 * can see it. Similar to atomic_dec_and_lock(), but for an 407 * can see it. Similar to atomic_dec_and_lock(), but for an
383 * rwlock 408 * rwlock
384 */ 409 */
385 if (atomic_add_unless(&cg->refcount, -1, 1)) 410 if (atomic_add_unless(&cset->refcount, -1, 1))
386 return; 411 return;
387 write_lock(&css_set_lock); 412 write_lock(&css_set_lock);
388 if (!atomic_dec_and_test(&cg->refcount)) { 413 if (!atomic_dec_and_test(&cset->refcount)) {
389 write_unlock(&css_set_lock); 414 write_unlock(&css_set_lock);
390 return; 415 return;
391 } 416 }
392 417
393 /* This css_set is dead. unlink it and release cgroup refcounts */ 418 /* This css_set is dead. unlink it and release cgroup refcounts */
394 hash_del(&cg->hlist); 419 hash_del(&cset->hlist);
395 css_set_count--; 420 css_set_count--;
396 421
397 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 422 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
398 cg_link_list) {
399 struct cgroup *cgrp = link->cgrp; 423 struct cgroup *cgrp = link->cgrp;
400 list_del(&link->cg_link_list);
401 list_del(&link->cgrp_link_list);
402 424
403 /* 425 list_del(&link->cset_link);
404 * We may not be holding cgroup_mutex, and if cgrp->count is 426 list_del(&link->cgrp_link);
405 * dropped to 0 the cgroup can be destroyed at any time, hence 427
406 * rcu_read_lock is used to keep it alive. 428 /* @cgrp can't go away while we're holding css_set_lock */
407 */ 429 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
408 rcu_read_lock();
409 if (atomic_dec_and_test(&cgrp->count) &&
410 notify_on_release(cgrp)) {
411 if (taskexit) 430 if (taskexit)
412 set_bit(CGRP_RELEASABLE, &cgrp->flags); 431 set_bit(CGRP_RELEASABLE, &cgrp->flags);
413 check_for_release(cgrp); 432 check_for_release(cgrp);
414 } 433 }
415 rcu_read_unlock();
416 434
417 kfree(link); 435 kfree(link);
418 } 436 }
419 437
420 write_unlock(&css_set_lock); 438 write_unlock(&css_set_lock);
421 kfree_rcu(cg, rcu_head); 439 kfree_rcu(cset, rcu_head);
422} 440}
423 441
424/* 442/*
425 * refcounted get/put for css_set objects 443 * refcounted get/put for css_set objects
426 */ 444 */
427static inline void get_css_set(struct css_set *cg) 445static inline void get_css_set(struct css_set *cset)
428{ 446{
429 atomic_inc(&cg->refcount); 447 atomic_inc(&cset->refcount);
430} 448}
431 449
432static inline void put_css_set(struct css_set *cg) 450static inline void put_css_set(struct css_set *cset)
433{ 451{
434 __put_css_set(cg, 0); 452 __put_css_set(cset, 0);
435} 453}
436 454
437static inline void put_css_set_taskexit(struct css_set *cg) 455static inline void put_css_set_taskexit(struct css_set *cset)
438{ 456{
439 __put_css_set(cg, 1); 457 __put_css_set(cset, 1);
440} 458}
441 459
442/* 460/**
443 * compare_css_sets - helper function for find_existing_css_set(). 461 * compare_css_sets - helper function for find_existing_css_set().
444 * @cg: candidate css_set being tested 462 * @cset: candidate css_set being tested
445 * @old_cg: existing css_set for a task 463 * @old_cset: existing css_set for a task
446 * @new_cgrp: cgroup that's being entered by the task 464 * @new_cgrp: cgroup that's being entered by the task
447 * @template: desired set of css pointers in css_set (pre-calculated) 465 * @template: desired set of css pointers in css_set (pre-calculated)
448 * 466 *
449 * Returns true if "cg" matches "old_cg" except for the hierarchy 467 * Returns true if "cg" matches "old_cg" except for the hierarchy
450 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 468 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
451 */ 469 */
452static bool compare_css_sets(struct css_set *cg, 470static bool compare_css_sets(struct css_set *cset,
453 struct css_set *old_cg, 471 struct css_set *old_cset,
454 struct cgroup *new_cgrp, 472 struct cgroup *new_cgrp,
455 struct cgroup_subsys_state *template[]) 473 struct cgroup_subsys_state *template[])
456{ 474{
457 struct list_head *l1, *l2; 475 struct list_head *l1, *l2;
458 476
459 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { 477 if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
460 /* Not all subsystems matched */ 478 /* Not all subsystems matched */
461 return false; 479 return false;
462 } 480 }
@@ -470,28 +488,28 @@ static bool compare_css_sets(struct css_set *cg,
470 * candidates. 488 * candidates.
471 */ 489 */
472 490
473 l1 = &cg->cg_links; 491 l1 = &cset->cgrp_links;
474 l2 = &old_cg->cg_links; 492 l2 = &old_cset->cgrp_links;
475 while (1) { 493 while (1) {
476 struct cg_cgroup_link *cgl1, *cgl2; 494 struct cgrp_cset_link *link1, *link2;
477 struct cgroup *cg1, *cg2; 495 struct cgroup *cgrp1, *cgrp2;
478 496
479 l1 = l1->next; 497 l1 = l1->next;
480 l2 = l2->next; 498 l2 = l2->next;
481 /* See if we reached the end - both lists are equal length. */ 499 /* See if we reached the end - both lists are equal length. */
482 if (l1 == &cg->cg_links) { 500 if (l1 == &cset->cgrp_links) {
483 BUG_ON(l2 != &old_cg->cg_links); 501 BUG_ON(l2 != &old_cset->cgrp_links);
484 break; 502 break;
485 } else { 503 } else {
486 BUG_ON(l2 == &old_cg->cg_links); 504 BUG_ON(l2 == &old_cset->cgrp_links);
487 } 505 }
488 /* Locate the cgroups associated with these links. */ 506 /* Locate the cgroups associated with these links. */
489 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); 507 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
490 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); 508 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
491 cg1 = cgl1->cgrp; 509 cgrp1 = link1->cgrp;
492 cg2 = cgl2->cgrp; 510 cgrp2 = link2->cgrp;
493 /* Hierarchies should be linked in the same order. */ 511 /* Hierarchies should be linked in the same order. */
494 BUG_ON(cg1->root != cg2->root); 512 BUG_ON(cgrp1->root != cgrp2->root);
495 513
496 /* 514 /*
497 * If this hierarchy is the hierarchy of the cgroup 515 * If this hierarchy is the hierarchy of the cgroup
@@ -500,46 +518,39 @@ static bool compare_css_sets(struct css_set *cg,
500 * hierarchy, then this css_set should point to the 518 * hierarchy, then this css_set should point to the
501 * same cgroup as the old css_set. 519 * same cgroup as the old css_set.
502 */ 520 */
503 if (cg1->root == new_cgrp->root) { 521 if (cgrp1->root == new_cgrp->root) {
504 if (cg1 != new_cgrp) 522 if (cgrp1 != new_cgrp)
505 return false; 523 return false;
506 } else { 524 } else {
507 if (cg1 != cg2) 525 if (cgrp1 != cgrp2)
508 return false; 526 return false;
509 } 527 }
510 } 528 }
511 return true; 529 return true;
512} 530}
513 531
514/* 532/**
515 * find_existing_css_set() is a helper for 533 * find_existing_css_set - init css array and find the matching css_set
516 * find_css_set(), and checks to see whether an existing 534 * @old_cset: the css_set that we're using before the cgroup transition
517 * css_set is suitable. 535 * @cgrp: the cgroup that we're moving into
518 * 536 * @template: out param for the new set of csses, should be clear on entry
519 * oldcg: the cgroup group that we're using before the cgroup
520 * transition
521 *
522 * cgrp: the cgroup that we're moving into
523 *
524 * template: location in which to build the desired set of subsystem
525 * state objects for the new cgroup group
526 */ 537 */
527static struct css_set *find_existing_css_set( 538static struct css_set *find_existing_css_set(struct css_set *old_cset,
528 struct css_set *oldcg, 539 struct cgroup *cgrp,
529 struct cgroup *cgrp, 540 struct cgroup_subsys_state *template[])
530 struct cgroup_subsys_state *template[])
531{ 541{
532 int i;
533 struct cgroupfs_root *root = cgrp->root; 542 struct cgroupfs_root *root = cgrp->root;
534 struct css_set *cg; 543 struct cgroup_subsys *ss;
544 struct css_set *cset;
535 unsigned long key; 545 unsigned long key;
546 int i;
536 547
537 /* 548 /*
538 * Build the set of subsystem state objects that we want to see in the 549 * Build the set of subsystem state objects that we want to see in the
539 * new css_set. while subsystems can change globally, the entries here 550 * new css_set. while subsystems can change globally, the entries here
540 * won't change, so no need for locking. 551 * won't change, so no need for locking.
541 */ 552 */
542 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 553 for_each_subsys(ss, i) {
543 if (root->subsys_mask & (1UL << i)) { 554 if (root->subsys_mask & (1UL << i)) {
544 /* Subsystem is in this hierarchy. So we want 555 /* Subsystem is in this hierarchy. So we want
545 * the subsystem state from the new 556 * the subsystem state from the new
@@ -548,148 +559,152 @@ static struct css_set *find_existing_css_set(
548 } else { 559 } else {
549 /* Subsystem is not in this hierarchy, so we 560 /* Subsystem is not in this hierarchy, so we
550 * don't want to change the subsystem state */ 561 * don't want to change the subsystem state */
551 template[i] = oldcg->subsys[i]; 562 template[i] = old_cset->subsys[i];
552 } 563 }
553 } 564 }
554 565
555 key = css_set_hash(template); 566 key = css_set_hash(template);
556 hash_for_each_possible(css_set_table, cg, hlist, key) { 567 hash_for_each_possible(css_set_table, cset, hlist, key) {
557 if (!compare_css_sets(cg, oldcg, cgrp, template)) 568 if (!compare_css_sets(cset, old_cset, cgrp, template))
558 continue; 569 continue;
559 570
560 /* This css_set matches what we need */ 571 /* This css_set matches what we need */
561 return cg; 572 return cset;
562 } 573 }
563 574
564 /* No existing cgroup group matched */ 575 /* No existing cgroup group matched */
565 return NULL; 576 return NULL;
566} 577}
567 578
568static void free_cg_links(struct list_head *tmp) 579static void free_cgrp_cset_links(struct list_head *links_to_free)
569{ 580{
570 struct cg_cgroup_link *link; 581 struct cgrp_cset_link *link, *tmp_link;
571 struct cg_cgroup_link *saved_link;
572 582
573 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { 583 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
574 list_del(&link->cgrp_link_list); 584 list_del(&link->cset_link);
575 kfree(link); 585 kfree(link);
576 } 586 }
577} 587}
578 588
579/* 589/**
580 * allocate_cg_links() allocates "count" cg_cgroup_link structures 590 * allocate_cgrp_cset_links - allocate cgrp_cset_links
581 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on 591 * @count: the number of links to allocate
582 * success or a negative error 592 * @tmp_links: list_head the allocated links are put on
593 *
594 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
595 * through ->cset_link. Returns 0 on success or -errno.
583 */ 596 */
584static int allocate_cg_links(int count, struct list_head *tmp) 597static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
585{ 598{
586 struct cg_cgroup_link *link; 599 struct cgrp_cset_link *link;
587 int i; 600 int i;
588 INIT_LIST_HEAD(tmp); 601
602 INIT_LIST_HEAD(tmp_links);
603
589 for (i = 0; i < count; i++) { 604 for (i = 0; i < count; i++) {
590 link = kmalloc(sizeof(*link), GFP_KERNEL); 605 link = kzalloc(sizeof(*link), GFP_KERNEL);
591 if (!link) { 606 if (!link) {
592 free_cg_links(tmp); 607 free_cgrp_cset_links(tmp_links);
593 return -ENOMEM; 608 return -ENOMEM;
594 } 609 }
595 list_add(&link->cgrp_link_list, tmp); 610 list_add(&link->cset_link, tmp_links);
596 } 611 }
597 return 0; 612 return 0;
598} 613}
599 614
600/** 615/**
601 * link_css_set - a helper function to link a css_set to a cgroup 616 * link_css_set - a helper function to link a css_set to a cgroup
602 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() 617 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
603 * @cg: the css_set to be linked 618 * @cset: the css_set to be linked
604 * @cgrp: the destination cgroup 619 * @cgrp: the destination cgroup
605 */ 620 */
606static void link_css_set(struct list_head *tmp_cg_links, 621static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
607 struct css_set *cg, struct cgroup *cgrp) 622 struct cgroup *cgrp)
608{ 623{
609 struct cg_cgroup_link *link; 624 struct cgrp_cset_link *link;
610 625
611 BUG_ON(list_empty(tmp_cg_links)); 626 BUG_ON(list_empty(tmp_links));
612 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, 627 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
613 cgrp_link_list); 628 link->cset = cset;
614 link->cg = cg;
615 link->cgrp = cgrp; 629 link->cgrp = cgrp;
616 atomic_inc(&cgrp->count); 630 list_move(&link->cset_link, &cgrp->cset_links);
617 list_move(&link->cgrp_link_list, &cgrp->css_sets);
618 /* 631 /*
619 * Always add links to the tail of the list so that the list 632 * Always add links to the tail of the list so that the list
620 * is sorted by order of hierarchy creation 633 * is sorted by order of hierarchy creation
621 */ 634 */
622 list_add_tail(&link->cg_link_list, &cg->cg_links); 635 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
623} 636}
624 637
625/* 638/**
626 * find_css_set() takes an existing cgroup group and a 639 * find_css_set - return a new css_set with one cgroup updated
627 * cgroup object, and returns a css_set object that's 640 * @old_cset: the baseline css_set
628 * equivalent to the old group, but with the given cgroup 641 * @cgrp: the cgroup to be updated
629 * substituted into the appropriate hierarchy. Must be called with 642 *
630 * cgroup_mutex held 643 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
644 * substituted into the appropriate hierarchy.
631 */ 645 */
632static struct css_set *find_css_set( 646static struct css_set *find_css_set(struct css_set *old_cset,
633 struct css_set *oldcg, struct cgroup *cgrp) 647 struct cgroup *cgrp)
634{ 648{
635 struct css_set *res; 649 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
636 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 650 struct css_set *cset;
637 651 struct list_head tmp_links;
638 struct list_head tmp_cg_links; 652 struct cgrp_cset_link *link;
639
640 struct cg_cgroup_link *link;
641 unsigned long key; 653 unsigned long key;
642 654
655 lockdep_assert_held(&cgroup_mutex);
656
643 /* First see if we already have a cgroup group that matches 657 /* First see if we already have a cgroup group that matches
644 * the desired set */ 658 * the desired set */
645 read_lock(&css_set_lock); 659 read_lock(&css_set_lock);
646 res = find_existing_css_set(oldcg, cgrp, template); 660 cset = find_existing_css_set(old_cset, cgrp, template);
647 if (res) 661 if (cset)
648 get_css_set(res); 662 get_css_set(cset);
649 read_unlock(&css_set_lock); 663 read_unlock(&css_set_lock);
650 664
651 if (res) 665 if (cset)
652 return res; 666 return cset;
653 667
654 res = kmalloc(sizeof(*res), GFP_KERNEL); 668 cset = kzalloc(sizeof(*cset), GFP_KERNEL);
655 if (!res) 669 if (!cset)
656 return NULL; 670 return NULL;
657 671
658 /* Allocate all the cg_cgroup_link objects that we'll need */ 672 /* Allocate all the cgrp_cset_link objects that we'll need */
659 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { 673 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
660 kfree(res); 674 kfree(cset);
661 return NULL; 675 return NULL;
662 } 676 }
663 677
664 atomic_set(&res->refcount, 1); 678 atomic_set(&cset->refcount, 1);
665 INIT_LIST_HEAD(&res->cg_links); 679 INIT_LIST_HEAD(&cset->cgrp_links);
666 INIT_LIST_HEAD(&res->tasks); 680 INIT_LIST_HEAD(&cset->tasks);
667 INIT_HLIST_NODE(&res->hlist); 681 INIT_HLIST_NODE(&cset->hlist);
668 682
669 /* Copy the set of subsystem state objects generated in 683 /* Copy the set of subsystem state objects generated in
670 * find_existing_css_set() */ 684 * find_existing_css_set() */
671 memcpy(res->subsys, template, sizeof(res->subsys)); 685 memcpy(cset->subsys, template, sizeof(cset->subsys));
672 686
673 write_lock(&css_set_lock); 687 write_lock(&css_set_lock);
674 /* Add reference counts and links from the new css_set. */ 688 /* Add reference counts and links from the new css_set. */
675 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { 689 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
676 struct cgroup *c = link->cgrp; 690 struct cgroup *c = link->cgrp;
691
677 if (c->root == cgrp->root) 692 if (c->root == cgrp->root)
678 c = cgrp; 693 c = cgrp;
679 link_css_set(&tmp_cg_links, res, c); 694 link_css_set(&tmp_links, cset, c);
680 } 695 }
681 696
682 BUG_ON(!list_empty(&tmp_cg_links)); 697 BUG_ON(!list_empty(&tmp_links));
683 698
684 css_set_count++; 699 css_set_count++;
685 700
686 /* Add this cgroup group to the hash table */ 701 /* Add this cgroup group to the hash table */
687 key = css_set_hash(res->subsys); 702 key = css_set_hash(cset->subsys);
688 hash_add(css_set_table, &res->hlist, key); 703 hash_add(css_set_table, &cset->hlist, key);
689 704
690 write_unlock(&css_set_lock); 705 write_unlock(&css_set_lock);
691 706
692 return res; 707 return cset;
693} 708}
694 709
695/* 710/*
@@ -699,7 +714,7 @@ static struct css_set *find_css_set(
699static struct cgroup *task_cgroup_from_root(struct task_struct *task, 714static struct cgroup *task_cgroup_from_root(struct task_struct *task,
700 struct cgroupfs_root *root) 715 struct cgroupfs_root *root)
701{ 716{
702 struct css_set *css; 717 struct css_set *cset;
703 struct cgroup *res = NULL; 718 struct cgroup *res = NULL;
704 719
705 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 720 BUG_ON(!mutex_is_locked(&cgroup_mutex));
@@ -709,13 +724,15 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
709 * task can't change groups, so the only thing that can happen 724 * task can't change groups, so the only thing that can happen
710 * is that it exits and its css is set back to init_css_set. 725 * is that it exits and its css is set back to init_css_set.
711 */ 726 */
712 css = task->cgroups; 727 cset = task_css_set(task);
713 if (css == &init_css_set) { 728 if (cset == &init_css_set) {
714 res = &root->top_cgroup; 729 res = &root->top_cgroup;
715 } else { 730 } else {
716 struct cg_cgroup_link *link; 731 struct cgrp_cset_link *link;
717 list_for_each_entry(link, &css->cg_links, cg_link_list) { 732
733 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
718 struct cgroup *c = link->cgrp; 734 struct cgroup *c = link->cgrp;
735
719 if (c->root == root) { 736 if (c->root == root) {
720 res = c; 737 res = c;
721 break; 738 break;
@@ -828,14 +845,14 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
828 845
829static void cgroup_free_fn(struct work_struct *work) 846static void cgroup_free_fn(struct work_struct *work)
830{ 847{
831 struct cgroup *cgrp = container_of(work, struct cgroup, free_work); 848 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
832 struct cgroup_subsys *ss; 849 struct cgroup_subsys *ss;
833 850
834 mutex_lock(&cgroup_mutex); 851 mutex_lock(&cgroup_mutex);
835 /* 852 /*
836 * Release the subsystem state objects. 853 * Release the subsystem state objects.
837 */ 854 */
838 for_each_subsys(cgrp->root, ss) 855 for_each_root_subsys(cgrp->root, ss)
839 ss->css_free(cgrp); 856 ss->css_free(cgrp);
840 857
841 cgrp->root->number_of_cgroups--; 858 cgrp->root->number_of_cgroups--;
@@ -873,7 +890,8 @@ static void cgroup_free_rcu(struct rcu_head *head)
873{ 890{
874 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 891 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
875 892
876 schedule_work(&cgrp->free_work); 893 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
894 schedule_work(&cgrp->destroy_work);
877} 895}
878 896
879static void cgroup_diput(struct dentry *dentry, struct inode *inode) 897static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -882,7 +900,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
882 if (S_ISDIR(inode->i_mode)) { 900 if (S_ISDIR(inode->i_mode)) {
883 struct cgroup *cgrp = dentry->d_fsdata; 901 struct cgroup *cgrp = dentry->d_fsdata;
884 902
885 BUG_ON(!(cgroup_is_removed(cgrp))); 903 BUG_ON(!(cgroup_is_dead(cgrp)));
886 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 904 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
887 } else { 905 } else {
888 struct cfent *cfe = __d_cfe(dentry); 906 struct cfent *cfe = __d_cfe(dentry);
@@ -950,7 +968,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
950 struct cgroup *cgrp = __d_cgrp(dir); 968 struct cgroup *cgrp = __d_cgrp(dir);
951 struct cgroup_subsys *ss; 969 struct cgroup_subsys *ss;
952 970
953 for_each_subsys(cgrp->root, ss) { 971 for_each_root_subsys(cgrp->root, ss) {
954 struct cftype_set *set; 972 struct cftype_set *set;
955 if (!test_bit(ss->subsys_id, &subsys_mask)) 973 if (!test_bit(ss->subsys_id, &subsys_mask))
956 continue; 974 continue;
@@ -988,30 +1006,23 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
988 * returns an error, no reference counts are touched. 1006 * returns an error, no reference counts are touched.
989 */ 1007 */
990static int rebind_subsystems(struct cgroupfs_root *root, 1008static int rebind_subsystems(struct cgroupfs_root *root,
991 unsigned long final_subsys_mask) 1009 unsigned long added_mask, unsigned removed_mask)
992{ 1010{
993 unsigned long added_mask, removed_mask;
994 struct cgroup *cgrp = &root->top_cgroup; 1011 struct cgroup *cgrp = &root->top_cgroup;
1012 struct cgroup_subsys *ss;
995 int i; 1013 int i;
996 1014
997 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1015 BUG_ON(!mutex_is_locked(&cgroup_mutex));
998 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 1016 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
999 1017
1000 removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
1001 added_mask = final_subsys_mask & ~root->actual_subsys_mask;
1002 /* Check that any added subsystems are currently free */ 1018 /* Check that any added subsystems are currently free */
1003 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1019 for_each_subsys(ss, i) {
1004 unsigned long bit = 1UL << i; 1020 unsigned long bit = 1UL << i;
1005 struct cgroup_subsys *ss = subsys[i]; 1021
1006 if (!(bit & added_mask)) 1022 if (!(bit & added_mask))
1007 continue; 1023 continue;
1008 /* 1024
1009 * Nobody should tell us to do a subsys that doesn't exist: 1025 if (ss->root != &cgroup_dummy_root) {
1010 * parse_cgroupfs_options should catch that case and refcounts
1011 * ensure that subsystems won't disappear once selected.
1012 */
1013 BUG_ON(ss == NULL);
1014 if (ss->root != &rootnode) {
1015 /* Subsystem isn't free */ 1026 /* Subsystem isn't free */
1016 return -EBUSY; 1027 return -EBUSY;
1017 } 1028 }
@@ -1025,38 +1036,41 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1025 return -EBUSY; 1036 return -EBUSY;
1026 1037
1027 /* Process each subsystem */ 1038 /* Process each subsystem */
1028 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1039 for_each_subsys(ss, i) {
1029 struct cgroup_subsys *ss = subsys[i];
1030 unsigned long bit = 1UL << i; 1040 unsigned long bit = 1UL << i;
1041
1031 if (bit & added_mask) { 1042 if (bit & added_mask) {
1032 /* We're binding this subsystem to this hierarchy */ 1043 /* We're binding this subsystem to this hierarchy */
1033 BUG_ON(ss == NULL);
1034 BUG_ON(cgrp->subsys[i]); 1044 BUG_ON(cgrp->subsys[i]);
1035 BUG_ON(!dummytop->subsys[i]); 1045 BUG_ON(!cgroup_dummy_top->subsys[i]);
1036 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 1046 BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);
1037 cgrp->subsys[i] = dummytop->subsys[i]; 1047
1048 cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
1038 cgrp->subsys[i]->cgroup = cgrp; 1049 cgrp->subsys[i]->cgroup = cgrp;
1039 list_move(&ss->sibling, &root->subsys_list); 1050 list_move(&ss->sibling, &root->subsys_list);
1040 ss->root = root; 1051 ss->root = root;
1041 if (ss->bind) 1052 if (ss->bind)
1042 ss->bind(cgrp); 1053 ss->bind(cgrp);
1054
1043 /* refcount was already taken, and we're keeping it */ 1055 /* refcount was already taken, and we're keeping it */
1056 root->subsys_mask |= bit;
1044 } else if (bit & removed_mask) { 1057 } else if (bit & removed_mask) {
1045 /* We're removing this subsystem */ 1058 /* We're removing this subsystem */
1046 BUG_ON(ss == NULL); 1059 BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
1047 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
1048 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1060 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
1061
1049 if (ss->bind) 1062 if (ss->bind)
1050 ss->bind(dummytop); 1063 ss->bind(cgroup_dummy_top);
1051 dummytop->subsys[i]->cgroup = dummytop; 1064 cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
1052 cgrp->subsys[i] = NULL; 1065 cgrp->subsys[i] = NULL;
1053 subsys[i]->root = &rootnode; 1066 cgroup_subsys[i]->root = &cgroup_dummy_root;
1054 list_move(&ss->sibling, &rootnode.subsys_list); 1067 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1068
1055 /* subsystem is now free - drop reference on module */ 1069 /* subsystem is now free - drop reference on module */
1056 module_put(ss->module); 1070 module_put(ss->module);
1057 } else if (bit & final_subsys_mask) { 1071 root->subsys_mask &= ~bit;
1072 } else if (bit & root->subsys_mask) {
1058 /* Subsystem state should already exist */ 1073 /* Subsystem state should already exist */
1059 BUG_ON(ss == NULL);
1060 BUG_ON(!cgrp->subsys[i]); 1074 BUG_ON(!cgrp->subsys[i]);
1061 /* 1075 /*
1062 * a refcount was taken, but we already had one, so 1076 * a refcount was taken, but we already had one, so
@@ -1071,7 +1085,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1071 BUG_ON(cgrp->subsys[i]); 1085 BUG_ON(cgrp->subsys[i]);
1072 } 1086 }
1073 } 1087 }
1074 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; 1088
1089 /*
1090 * Mark @root has finished binding subsystems. @root->subsys_mask
1091 * now matches the bound subsystems.
1092 */
1093 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1075 1094
1076 return 0; 1095 return 0;
1077} 1096}
@@ -1082,7 +1101,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1082 struct cgroup_subsys *ss; 1101 struct cgroup_subsys *ss;
1083 1102
1084 mutex_lock(&cgroup_root_mutex); 1103 mutex_lock(&cgroup_root_mutex);
1085 for_each_subsys(root, ss) 1104 for_each_root_subsys(root, ss)
1086 seq_printf(seq, ",%s", ss->name); 1105 seq_printf(seq, ",%s", ss->name);
1087 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1106 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1088 seq_puts(seq, ",sane_behavior"); 1107 seq_puts(seq, ",sane_behavior");
@@ -1114,18 +1133,19 @@ struct cgroup_sb_opts {
1114}; 1133};
1115 1134
1116/* 1135/*
1117 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call 1136 * Convert a hierarchy specifier into a bitmask of subsystems and
1118 * with cgroup_mutex held to protect the subsys[] array. This function takes 1137 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1119 * refcounts on subsystems to be used, unless it returns error, in which case 1138 * array. This function takes refcounts on subsystems to be used, unless it
1120 * no refcounts are taken. 1139 * returns error, in which case no refcounts are taken.
1121 */ 1140 */
1122static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1141static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1123{ 1142{
1124 char *token, *o = data; 1143 char *token, *o = data;
1125 bool all_ss = false, one_ss = false; 1144 bool all_ss = false, one_ss = false;
1126 unsigned long mask = (unsigned long)-1; 1145 unsigned long mask = (unsigned long)-1;
1127 int i;
1128 bool module_pin_failed = false; 1146 bool module_pin_failed = false;
1147 struct cgroup_subsys *ss;
1148 int i;
1129 1149
1130 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1150 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1131 1151
@@ -1202,10 +1222,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1202 continue; 1222 continue;
1203 } 1223 }
1204 1224
1205 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1225 for_each_subsys(ss, i) {
1206 struct cgroup_subsys *ss = subsys[i];
1207 if (ss == NULL)
1208 continue;
1209 if (strcmp(token, ss->name)) 1226 if (strcmp(token, ss->name))
1210 continue; 1227 continue;
1211 if (ss->disabled) 1228 if (ss->disabled)
@@ -1228,16 +1245,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1228 * otherwise if 'none', 'name=' and a subsystem name options 1245 * otherwise if 'none', 'name=' and a subsystem name options
1229 * were not specified, let's default to 'all' 1246 * were not specified, let's default to 'all'
1230 */ 1247 */
1231 if (all_ss || (!one_ss && !opts->none && !opts->name)) { 1248 if (all_ss || (!one_ss && !opts->none && !opts->name))
1232 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1249 for_each_subsys(ss, i)
1233 struct cgroup_subsys *ss = subsys[i]; 1250 if (!ss->disabled)
1234 if (ss == NULL) 1251 set_bit(i, &opts->subsys_mask);
1235 continue;
1236 if (ss->disabled)
1237 continue;
1238 set_bit(i, &opts->subsys_mask);
1239 }
1240 }
1241 1252
1242 /* Consistency checks */ 1253 /* Consistency checks */
1243 1254
@@ -1281,12 +1292,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1281 * take duplicate reference counts on a subsystem that's already used, 1292 * take duplicate reference counts on a subsystem that's already used,
1282 * but rebind_subsystems handles this case. 1293 * but rebind_subsystems handles this case.
1283 */ 1294 */
1284 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1295 for_each_subsys(ss, i) {
1285 unsigned long bit = 1UL << i; 1296 if (!(opts->subsys_mask & (1UL << i)))
1286
1287 if (!(bit & opts->subsys_mask))
1288 continue; 1297 continue;
1289 if (!try_module_get(subsys[i]->module)) { 1298 if (!try_module_get(cgroup_subsys[i]->module)) {
1290 module_pin_failed = true; 1299 module_pin_failed = true;
1291 break; 1300 break;
1292 } 1301 }
@@ -1303,7 +1312,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1303 1312
1304 if (!(bit & opts->subsys_mask)) 1313 if (!(bit & opts->subsys_mask))
1305 continue; 1314 continue;
1306 module_put(subsys[i]->module); 1315 module_put(cgroup_subsys[i]->module);
1307 } 1316 }
1308 return -ENOENT; 1317 return -ENOENT;
1309 } 1318 }
@@ -1313,14 +1322,14 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1313 1322
1314static void drop_parsed_module_refcounts(unsigned long subsys_mask) 1323static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1315{ 1324{
1325 struct cgroup_subsys *ss;
1316 int i; 1326 int i;
1317 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1318 unsigned long bit = 1UL << i;
1319 1327
1320 if (!(bit & subsys_mask)) 1328 mutex_lock(&cgroup_mutex);
1321 continue; 1329 for_each_subsys(ss, i)
1322 module_put(subsys[i]->module); 1330 if (subsys_mask & (1UL << i))
1323 } 1331 module_put(cgroup_subsys[i]->module);
1332 mutex_unlock(&cgroup_mutex);
1324} 1333}
1325 1334
1326static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1335static int cgroup_remount(struct super_block *sb, int *flags, char *data)
@@ -1345,7 +1354,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1345 if (ret) 1354 if (ret)
1346 goto out_unlock; 1355 goto out_unlock;
1347 1356
1348 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) 1357 if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1349 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1358 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1350 task_tgid_nr(current), current->comm); 1359 task_tgid_nr(current), current->comm);
1351 1360
@@ -1353,10 +1362,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1353 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1362 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1354 1363
1355 /* Don't allow flags or name to change at remount */ 1364 /* Don't allow flags or name to change at remount */
1356 if (opts.flags != root->flags || 1365 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1357 (opts.name && strcmp(opts.name, root->name))) { 1366 (opts.name && strcmp(opts.name, root->name))) {
1367 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
1368 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1369 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1358 ret = -EINVAL; 1370 ret = -EINVAL;
1359 drop_parsed_module_refcounts(opts.subsys_mask);
1360 goto out_unlock; 1371 goto out_unlock;
1361 } 1372 }
1362 1373
@@ -1367,11 +1378,10 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1367 */ 1378 */
1368 cgroup_clear_directory(cgrp->dentry, false, removed_mask); 1379 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1369 1380
1370 ret = rebind_subsystems(root, opts.subsys_mask); 1381 ret = rebind_subsystems(root, added_mask, removed_mask);
1371 if (ret) { 1382 if (ret) {
1372 /* rebind_subsystems failed, re-populate the removed files */ 1383 /* rebind_subsystems failed, re-populate the removed files */
1373 cgroup_populate_dir(cgrp, false, removed_mask); 1384 cgroup_populate_dir(cgrp, false, removed_mask);
1374 drop_parsed_module_refcounts(opts.subsys_mask);
1375 goto out_unlock; 1385 goto out_unlock;
1376 } 1386 }
1377 1387
@@ -1386,6 +1396,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1386 mutex_unlock(&cgroup_root_mutex); 1396 mutex_unlock(&cgroup_root_mutex);
1387 mutex_unlock(&cgroup_mutex); 1397 mutex_unlock(&cgroup_mutex);
1388 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1398 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1399 if (ret)
1400 drop_parsed_module_refcounts(opts.subsys_mask);
1389 return ret; 1401 return ret;
1390} 1402}
1391 1403
@@ -1401,11 +1413,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1401 INIT_LIST_HEAD(&cgrp->sibling); 1413 INIT_LIST_HEAD(&cgrp->sibling);
1402 INIT_LIST_HEAD(&cgrp->children); 1414 INIT_LIST_HEAD(&cgrp->children);
1403 INIT_LIST_HEAD(&cgrp->files); 1415 INIT_LIST_HEAD(&cgrp->files);
1404 INIT_LIST_HEAD(&cgrp->css_sets); 1416 INIT_LIST_HEAD(&cgrp->cset_links);
1405 INIT_LIST_HEAD(&cgrp->allcg_node);
1406 INIT_LIST_HEAD(&cgrp->release_list); 1417 INIT_LIST_HEAD(&cgrp->release_list);
1407 INIT_LIST_HEAD(&cgrp->pidlists); 1418 INIT_LIST_HEAD(&cgrp->pidlists);
1408 INIT_WORK(&cgrp->free_work, cgroup_free_fn);
1409 mutex_init(&cgrp->pidlist_mutex); 1419 mutex_init(&cgrp->pidlist_mutex);
1410 INIT_LIST_HEAD(&cgrp->event_list); 1420 INIT_LIST_HEAD(&cgrp->event_list);
1411 spin_lock_init(&cgrp->event_list_lock); 1421 spin_lock_init(&cgrp->event_list_lock);
@@ -1418,37 +1428,37 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1418 1428
1419 INIT_LIST_HEAD(&root->subsys_list); 1429 INIT_LIST_HEAD(&root->subsys_list);
1420 INIT_LIST_HEAD(&root->root_list); 1430 INIT_LIST_HEAD(&root->root_list);
1421 INIT_LIST_HEAD(&root->allcg_list);
1422 root->number_of_cgroups = 1; 1431 root->number_of_cgroups = 1;
1423 cgrp->root = root; 1432 cgrp->root = root;
1424 cgrp->name = &root_cgroup_name; 1433 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1425 init_cgroup_housekeeping(cgrp); 1434 init_cgroup_housekeeping(cgrp);
1426 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1427} 1435}
1428 1436
1429static bool init_root_id(struct cgroupfs_root *root) 1437static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
1430{ 1438{
1431 int ret = 0; 1439 int id;
1432 1440
1433 do { 1441 lockdep_assert_held(&cgroup_mutex);
1434 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) 1442 lockdep_assert_held(&cgroup_root_mutex);
1435 return false; 1443
1436 spin_lock(&hierarchy_id_lock); 1444 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
1437 /* Try to allocate the next unused ID */ 1445 GFP_KERNEL);
1438 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, 1446 if (id < 0)
1439 &root->hierarchy_id); 1447 return id;
1440 if (ret == -ENOSPC) 1448
1441 /* Try again starting from 0 */ 1449 root->hierarchy_id = id;
1442 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); 1450 return 0;
1443 if (!ret) { 1451}
1444 next_hierarchy_id = root->hierarchy_id + 1; 1452
1445 } else if (ret != -EAGAIN) { 1453static void cgroup_exit_root_id(struct cgroupfs_root *root)
1446 /* Can only get here if the 31-bit IDR is full ... */ 1454{
1447 BUG_ON(ret); 1455 lockdep_assert_held(&cgroup_mutex);
1448 } 1456 lockdep_assert_held(&cgroup_root_mutex);
1449 spin_unlock(&hierarchy_id_lock); 1457
1450 } while (ret); 1458 if (root->hierarchy_id) {
1451 return true; 1459 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1460 root->hierarchy_id = 0;
1461 }
1452} 1462}
1453 1463
1454static int cgroup_test_super(struct super_block *sb, void *data) 1464static int cgroup_test_super(struct super_block *sb, void *data)
@@ -1482,12 +1492,16 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1482 if (!root) 1492 if (!root)
1483 return ERR_PTR(-ENOMEM); 1493 return ERR_PTR(-ENOMEM);
1484 1494
1485 if (!init_root_id(root)) {
1486 kfree(root);
1487 return ERR_PTR(-ENOMEM);
1488 }
1489 init_cgroup_root(root); 1495 init_cgroup_root(root);
1490 1496
1497 /*
1498 * We need to set @root->subsys_mask now so that @root can be
1499 * matched by cgroup_test_super() before it finishes
1500 * initialization; otherwise, competing mounts with the same
1501 * options may try to bind the same subsystems instead of waiting
1502 * for the first one leading to unexpected mount errors.
1503 * SUBSYS_BOUND will be set once actual binding is complete.
1504 */
1491 root->subsys_mask = opts->subsys_mask; 1505 root->subsys_mask = opts->subsys_mask;
1492 root->flags = opts->flags; 1506 root->flags = opts->flags;
1493 ida_init(&root->cgroup_ida); 1507 ida_init(&root->cgroup_ida);
@@ -1500,17 +1514,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1500 return root; 1514 return root;
1501} 1515}
1502 1516
1503static void cgroup_drop_root(struct cgroupfs_root *root) 1517static void cgroup_free_root(struct cgroupfs_root *root)
1504{ 1518{
1505 if (!root) 1519 if (root) {
1506 return; 1520 /* hierarchy ID should already have been released */
1521 WARN_ON_ONCE(root->hierarchy_id);
1507 1522
1508 BUG_ON(!root->hierarchy_id); 1523 ida_destroy(&root->cgroup_ida);
1509 spin_lock(&hierarchy_id_lock); 1524 kfree(root);
1510 ida_remove(&hierarchy_ida, root->hierarchy_id); 1525 }
1511 spin_unlock(&hierarchy_id_lock);
1512 ida_destroy(&root->cgroup_ida);
1513 kfree(root);
1514} 1526}
1515 1527
1516static int cgroup_set_super(struct super_block *sb, void *data) 1528static int cgroup_set_super(struct super_block *sb, void *data)
@@ -1597,7 +1609,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1597 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); 1609 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
1598 if (IS_ERR(sb)) { 1610 if (IS_ERR(sb)) {
1599 ret = PTR_ERR(sb); 1611 ret = PTR_ERR(sb);
1600 cgroup_drop_root(opts.new_root); 1612 cgroup_free_root(opts.new_root);
1601 goto drop_modules; 1613 goto drop_modules;
1602 } 1614 }
1603 1615
@@ -1605,12 +1617,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1605 BUG_ON(!root); 1617 BUG_ON(!root);
1606 if (root == opts.new_root) { 1618 if (root == opts.new_root) {
1607 /* We used the new root structure, so this is a new hierarchy */ 1619 /* We used the new root structure, so this is a new hierarchy */
1608 struct list_head tmp_cg_links; 1620 struct list_head tmp_links;
1609 struct cgroup *root_cgrp = &root->top_cgroup; 1621 struct cgroup *root_cgrp = &root->top_cgroup;
1610 struct cgroupfs_root *existing_root; 1622 struct cgroupfs_root *existing_root;
1611 const struct cred *cred; 1623 const struct cred *cred;
1612 int i; 1624 int i;
1613 struct css_set *cg; 1625 struct css_set *cset;
1614 1626
1615 BUG_ON(sb->s_root != NULL); 1627 BUG_ON(sb->s_root != NULL);
1616 1628
@@ -1637,13 +1649,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1637 * that's us. The worst that can happen is that we 1649 * that's us. The worst that can happen is that we
1638 * have some link structures left over 1650 * have some link structures left over
1639 */ 1651 */
1640 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1652 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1641 if (ret) 1653 if (ret)
1642 goto unlock_drop; 1654 goto unlock_drop;
1643 1655
1644 ret = rebind_subsystems(root, root->subsys_mask); 1656 /* ID 0 is reserved for dummy root, 1 for unified hierarchy */
1657 ret = cgroup_init_root_id(root, 2, 0);
1658 if (ret)
1659 goto unlock_drop;
1660
1661 ret = rebind_subsystems(root, root->subsys_mask, 0);
1645 if (ret == -EBUSY) { 1662 if (ret == -EBUSY) {
1646 free_cg_links(&tmp_cg_links); 1663 free_cgrp_cset_links(&tmp_links);
1647 goto unlock_drop; 1664 goto unlock_drop;
1648 } 1665 }
1649 /* 1666 /*
@@ -1655,8 +1672,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1655 /* EBUSY should be the only error here */ 1672 /* EBUSY should be the only error here */
1656 BUG_ON(ret); 1673 BUG_ON(ret);
1657 1674
1658 list_add(&root->root_list, &roots); 1675 list_add(&root->root_list, &cgroup_roots);
1659 root_count++; 1676 cgroup_root_count++;
1660 1677
1661 sb->s_root->d_fsdata = root_cgrp; 1678 sb->s_root->d_fsdata = root_cgrp;
1662 root->top_cgroup.dentry = sb->s_root; 1679 root->top_cgroup.dentry = sb->s_root;
@@ -1664,11 +1681,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1664 /* Link the top cgroup in this hierarchy into all 1681 /* Link the top cgroup in this hierarchy into all
1665 * the css_set objects */ 1682 * the css_set objects */
1666 write_lock(&css_set_lock); 1683 write_lock(&css_set_lock);
1667 hash_for_each(css_set_table, i, cg, hlist) 1684 hash_for_each(css_set_table, i, cset, hlist)
1668 link_css_set(&tmp_cg_links, cg, root_cgrp); 1685 link_css_set(&tmp_links, cset, root_cgrp);
1669 write_unlock(&css_set_lock); 1686 write_unlock(&css_set_lock);
1670 1687
1671 free_cg_links(&tmp_cg_links); 1688 free_cgrp_cset_links(&tmp_links);
1672 1689
1673 BUG_ON(!list_empty(&root_cgrp->children)); 1690 BUG_ON(!list_empty(&root_cgrp->children));
1674 BUG_ON(root->number_of_cgroups != 1); 1691 BUG_ON(root->number_of_cgroups != 1);
@@ -1684,9 +1701,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1684 * We re-used an existing hierarchy - the new root (if 1701 * We re-used an existing hierarchy - the new root (if
1685 * any) is not needed 1702 * any) is not needed
1686 */ 1703 */
1687 cgroup_drop_root(opts.new_root); 1704 cgroup_free_root(opts.new_root);
1688 1705
1689 if (root->flags != opts.flags) { 1706 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1690 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1707 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1691 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1708 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1692 ret = -EINVAL; 1709 ret = -EINVAL;
@@ -1705,6 +1722,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1705 return dget(sb->s_root); 1722 return dget(sb->s_root);
1706 1723
1707 unlock_drop: 1724 unlock_drop:
1725 cgroup_exit_root_id(root);
1708 mutex_unlock(&cgroup_root_mutex); 1726 mutex_unlock(&cgroup_root_mutex);
1709 mutex_unlock(&cgroup_mutex); 1727 mutex_unlock(&cgroup_mutex);
1710 mutex_unlock(&inode->i_mutex); 1728 mutex_unlock(&inode->i_mutex);
@@ -1721,9 +1739,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1721static void cgroup_kill_sb(struct super_block *sb) { 1739static void cgroup_kill_sb(struct super_block *sb) {
1722 struct cgroupfs_root *root = sb->s_fs_info; 1740 struct cgroupfs_root *root = sb->s_fs_info;
1723 struct cgroup *cgrp = &root->top_cgroup; 1741 struct cgroup *cgrp = &root->top_cgroup;
1742 struct cgrp_cset_link *link, *tmp_link;
1724 int ret; 1743 int ret;
1725 struct cg_cgroup_link *link;
1726 struct cg_cgroup_link *saved_link;
1727 1744
1728 BUG_ON(!root); 1745 BUG_ON(!root);
1729 1746
@@ -1734,36 +1751,39 @@ static void cgroup_kill_sb(struct super_block *sb) {
1734 mutex_lock(&cgroup_root_mutex); 1751 mutex_lock(&cgroup_root_mutex);
1735 1752
1736 /* Rebind all subsystems back to the default hierarchy */ 1753 /* Rebind all subsystems back to the default hierarchy */
1737 ret = rebind_subsystems(root, 0); 1754 if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
1738 /* Shouldn't be able to fail ... */ 1755 ret = rebind_subsystems(root, 0, root->subsys_mask);
1739 BUG_ON(ret); 1756 /* Shouldn't be able to fail ... */
1757 BUG_ON(ret);
1758 }
1740 1759
1741 /* 1760 /*
1742 * Release all the links from css_sets to this hierarchy's 1761 * Release all the links from cset_links to this hierarchy's
1743 * root cgroup 1762 * root cgroup
1744 */ 1763 */
1745 write_lock(&css_set_lock); 1764 write_lock(&css_set_lock);
1746 1765
1747 list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, 1766 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1748 cgrp_link_list) { 1767 list_del(&link->cset_link);
1749 list_del(&link->cg_link_list); 1768 list_del(&link->cgrp_link);
1750 list_del(&link->cgrp_link_list);
1751 kfree(link); 1769 kfree(link);
1752 } 1770 }
1753 write_unlock(&css_set_lock); 1771 write_unlock(&css_set_lock);
1754 1772
1755 if (!list_empty(&root->root_list)) { 1773 if (!list_empty(&root->root_list)) {
1756 list_del(&root->root_list); 1774 list_del(&root->root_list);
1757 root_count--; 1775 cgroup_root_count--;
1758 } 1776 }
1759 1777
1778 cgroup_exit_root_id(root);
1779
1760 mutex_unlock(&cgroup_root_mutex); 1780 mutex_unlock(&cgroup_root_mutex);
1761 mutex_unlock(&cgroup_mutex); 1781 mutex_unlock(&cgroup_mutex);
1762 1782
1763 simple_xattrs_free(&cgrp->xattrs); 1783 simple_xattrs_free(&cgrp->xattrs);
1764 1784
1765 kill_litter_super(sb); 1785 kill_litter_super(sb);
1766 cgroup_drop_root(root); 1786 cgroup_free_root(root);
1767} 1787}
1768 1788
1769static struct file_system_type cgroup_fs_type = { 1789static struct file_system_type cgroup_fs_type = {
@@ -1825,6 +1845,38 @@ out:
1825} 1845}
1826EXPORT_SYMBOL_GPL(cgroup_path); 1846EXPORT_SYMBOL_GPL(cgroup_path);
1827 1847
1848/**
1849 * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy
1850 * @task: target task
1851 * @hierarchy_id: the hierarchy to look up @task's cgroup from
1852 * @buf: the buffer to write the path into
1853 * @buflen: the length of the buffer
1854 *
1855 * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and
1856 * copy its path into @buf. This function grabs cgroup_mutex and shouldn't
1857 * be used inside locks used by cgroup controller callbacks.
1858 */
1859int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id,
1860 char *buf, size_t buflen)
1861{
1862 struct cgroupfs_root *root;
1863 struct cgroup *cgrp = NULL;
1864 int ret = -ENOENT;
1865
1866 mutex_lock(&cgroup_mutex);
1867
1868 root = idr_find(&cgroup_hierarchy_idr, hierarchy_id);
1869 if (root) {
1870 cgrp = task_cgroup_from_root(task, root);
1871 ret = cgroup_path(cgrp, buf, buflen);
1872 }
1873
1874 mutex_unlock(&cgroup_mutex);
1875
1876 return ret;
1877}
1878EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy);
1879
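
As a usage sketch only (not part of this patch): a kernel-side caller could report the current task's cgroup on a given hierarchy roughly as below. The hierarchy ID of 1, the buffer size and report_current_cgroup() itself are illustrative assumptions.

	#include <linux/cgroup.h>
	#include <linux/sched.h>
	#include <linux/printk.h>

	static void report_current_cgroup(void)
	{
		char buf[128];	/* illustrative size; pick per call site */

		/* hierarchy ID 1 is an example; IDs are assigned at mount time */
		if (!task_cgroup_path_from_hierarchy(current, 1, buf, sizeof(buf)))
			pr_info("%s: cgroup %s on hierarchy 1\n",
				current->comm, buf);
	}
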
1828/* 1880/*
1829 * Control Group taskset 1881 * Control Group taskset
1830 */ 1882 */
@@ -1910,10 +1962,11 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1910 * 1962 *
1911 * Must be called with cgroup_mutex and threadgroup locked. 1963 * Must be called with cgroup_mutex and threadgroup locked.
1912 */ 1964 */
1913static void cgroup_task_migrate(struct cgroup *oldcgrp, 1965static void cgroup_task_migrate(struct cgroup *old_cgrp,
1914 struct task_struct *tsk, struct css_set *newcg) 1966 struct task_struct *tsk,
1967 struct css_set *new_cset)
1915{ 1968{
1916 struct css_set *oldcg; 1969 struct css_set *old_cset;
1917 1970
1918 /* 1971 /*
1919 * We are synchronized through threadgroup_lock() against PF_EXITING 1972 * We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1921,25 +1974,25 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp,
1921 * css_set to init_css_set and dropping the old one. 1974 * css_set to init_css_set and dropping the old one.
1922 */ 1975 */
1923 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1976 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1924 oldcg = tsk->cgroups; 1977 old_cset = task_css_set(tsk);
1925 1978
1926 task_lock(tsk); 1979 task_lock(tsk);
1927 rcu_assign_pointer(tsk->cgroups, newcg); 1980 rcu_assign_pointer(tsk->cgroups, new_cset);
1928 task_unlock(tsk); 1981 task_unlock(tsk);
1929 1982
1930 /* Update the css_set linked lists if we're using them */ 1983 /* Update the css_set linked lists if we're using them */
1931 write_lock(&css_set_lock); 1984 write_lock(&css_set_lock);
1932 if (!list_empty(&tsk->cg_list)) 1985 if (!list_empty(&tsk->cg_list))
1933 list_move(&tsk->cg_list, &newcg->tasks); 1986 list_move(&tsk->cg_list, &new_cset->tasks);
1934 write_unlock(&css_set_lock); 1987 write_unlock(&css_set_lock);
1935 1988
1936 /* 1989 /*
1937 * We just gained a reference on oldcg by taking it from the task. As 1990 * We just gained a reference on old_cset by taking it from the
1938 * trading it for newcg is protected by cgroup_mutex, we're safe to drop 1991 * task. As trading it for new_cset is protected by cgroup_mutex,
1939 * it here; it will be freed under RCU. 1992 * we're safe to drop it here; it will be freed under RCU.
1940 */ 1993 */
1941 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1994 set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1942 put_css_set(oldcg); 1995 put_css_set(old_cset);
1943} 1996}
1944 1997
1945/** 1998/**
@@ -2029,7 +2082,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2029 /* 2082 /*
2030 * step 1: check that we can legitimately attach to the cgroup. 2083 * step 1: check that we can legitimately attach to the cgroup.
2031 */ 2084 */
2032 for_each_subsys(root, ss) { 2085 for_each_root_subsys(root, ss) {
2033 if (ss->can_attach) { 2086 if (ss->can_attach) {
2034 retval = ss->can_attach(cgrp, &tset); 2087 retval = ss->can_attach(cgrp, &tset);
2035 if (retval) { 2088 if (retval) {
@@ -2044,8 +2097,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2044 * we use find_css_set, which allocates a new one if necessary. 2097 * we use find_css_set, which allocates a new one if necessary.
2045 */ 2098 */
2046 for (i = 0; i < group_size; i++) { 2099 for (i = 0; i < group_size; i++) {
2100 struct css_set *old_cset;
2101
2047 tc = flex_array_get(group, i); 2102 tc = flex_array_get(group, i);
2048 tc->cg = find_css_set(tc->task->cgroups, cgrp); 2103 old_cset = task_css_set(tc->task);
2104 tc->cg = find_css_set(old_cset, cgrp);
2049 if (!tc->cg) { 2105 if (!tc->cg) {
2050 retval = -ENOMEM; 2106 retval = -ENOMEM;
2051 goto out_put_css_set_refs; 2107 goto out_put_css_set_refs;
@@ -2066,7 +2122,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2066 /* 2122 /*
2067 * step 4: do subsystem attach callbacks. 2123 * step 4: do subsystem attach callbacks.
2068 */ 2124 */
2069 for_each_subsys(root, ss) { 2125 for_each_root_subsys(root, ss) {
2070 if (ss->attach) 2126 if (ss->attach)
2071 ss->attach(cgrp, &tset); 2127 ss->attach(cgrp, &tset);
2072 } 2128 }
@@ -2086,7 +2142,7 @@ out_put_css_set_refs:
2086 } 2142 }
2087out_cancel_attach: 2143out_cancel_attach:
2088 if (retval) { 2144 if (retval) {
2089 for_each_subsys(root, ss) { 2145 for_each_root_subsys(root, ss) {
2090 if (ss == failed_ss) 2146 if (ss == failed_ss)
2091 break; 2147 break;
2092 if (ss->cancel_attach) 2148 if (ss->cancel_attach)
@@ -2323,7 +2379,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2323 struct cftype *cft = __d_cft(file->f_dentry); 2379 struct cftype *cft = __d_cft(file->f_dentry);
2324 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2380 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2325 2381
2326 if (cgroup_is_removed(cgrp)) 2382 if (cgroup_is_dead(cgrp))
2327 return -ENODEV; 2383 return -ENODEV;
2328 if (cft->write) 2384 if (cft->write)
2329 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 2385 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -2368,7 +2424,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2368 struct cftype *cft = __d_cft(file->f_dentry); 2424 struct cftype *cft = __d_cft(file->f_dentry);
2369 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2425 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2370 2426
2371 if (cgroup_is_removed(cgrp)) 2427 if (cgroup_is_dead(cgrp))
2372 return -ENODEV; 2428 return -ENODEV;
2373 2429
2374 if (cft->read) 2430 if (cft->read)
@@ -2435,10 +2491,12 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
2435 cft = __d_cft(file->f_dentry); 2491 cft = __d_cft(file->f_dentry);
2436 2492
2437 if (cft->read_map || cft->read_seq_string) { 2493 if (cft->read_map || cft->read_seq_string) {
2438 struct cgroup_seqfile_state *state = 2494 struct cgroup_seqfile_state *state;
2439 kzalloc(sizeof(*state), GFP_USER); 2495
2496 state = kzalloc(sizeof(*state), GFP_USER);
2440 if (!state) 2497 if (!state)
2441 return -ENOMEM; 2498 return -ENOMEM;
2499
2442 state->cft = cft; 2500 state->cft = cft;
2443 state->cgroup = __d_cgrp(file->f_dentry->d_parent); 2501 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2444 file->f_op = &cgroup_seqfile_operations; 2502 file->f_op = &cgroup_seqfile_operations;
@@ -2486,6 +2544,13 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2486 2544
2487 cgrp = __d_cgrp(old_dentry); 2545 cgrp = __d_cgrp(old_dentry);
2488 2546
2547 /*
2548 * This isn't a proper migration and its usefulness is very
2549 * limited. Disallow if sane_behavior.
2550 */
2551 if (cgroup_sane_behavior(cgrp))
2552 return -EPERM;
2553
2489 name = cgroup_alloc_name(new_dentry); 2554 name = cgroup_alloc_name(new_dentry);
2490 if (!name) 2555 if (!name)
2491 return -ENOMEM; 2556 return -ENOMEM;
@@ -2496,7 +2561,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2496 return ret; 2561 return ret;
2497 } 2562 }
2498 2563
2499 old_name = cgrp->name; 2564 old_name = rcu_dereference_protected(cgrp->name, true);
2500 rcu_assign_pointer(cgrp->name, name); 2565 rcu_assign_pointer(cgrp->name, name);
2501 2566
2502 kfree_rcu(old_name, rcu_head); 2567 kfree_rcu(old_name, rcu_head);
@@ -2747,58 +2812,78 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2747 return ret; 2812 return ret;
2748} 2813}
2749 2814
2750static DEFINE_MUTEX(cgroup_cft_mutex);
2751
2752static void cgroup_cfts_prepare(void) 2815static void cgroup_cfts_prepare(void)
2753 __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) 2816 __acquires(&cgroup_mutex)
2754{ 2817{
2755 /* 2818 /*
2756 * Thanks to the entanglement with vfs inode locking, we can't walk 2819 * Thanks to the entanglement with vfs inode locking, we can't walk
2757 * the existing cgroups under cgroup_mutex and create files. 2820 * the existing cgroups under cgroup_mutex and create files.
2758 * Instead, we increment reference on all cgroups and build list of 2821 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU
2759 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure 2822 * read lock before calling cgroup_addrm_files().
2760 * exclusive access to the field.
2761 */ 2823 */
2762 mutex_lock(&cgroup_cft_mutex);
2763 mutex_lock(&cgroup_mutex); 2824 mutex_lock(&cgroup_mutex);
2764} 2825}
2765 2826
2766static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2827static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2767 struct cftype *cfts, bool is_add) 2828 struct cftype *cfts, bool is_add)
2768 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) 2829 __releases(&cgroup_mutex)
2769{ 2830{
2770 LIST_HEAD(pending); 2831 LIST_HEAD(pending);
2771 struct cgroup *cgrp, *n; 2832 struct cgroup *cgrp, *root = &ss->root->top_cgroup;
2833 struct super_block *sb = ss->root->sb;
2834 struct dentry *prev = NULL;
2835 struct inode *inode;
2836 u64 update_before;
2772 2837
2773 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2838 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2774 if (cfts && ss->root != &rootnode) { 2839 if (!cfts || ss->root == &cgroup_dummy_root ||
2775 list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { 2840 !atomic_inc_not_zero(&sb->s_active)) {
2776 dget(cgrp->dentry); 2841 mutex_unlock(&cgroup_mutex);
2777 list_add_tail(&cgrp->cft_q_node, &pending); 2842 return;
2778 }
2779 } 2843 }
2780 2844
2781 mutex_unlock(&cgroup_mutex);
2782
2783 /* 2845 /*
2784 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm 2846 * All cgroups which are created after we drop cgroup_mutex will
2785 * files for all cgroups which were created before. 2847 * have the updated set of files, so we only need to update the
2848 * cgroups created before the current @cgroup_serial_nr_next.
2786 */ 2849 */
2787 list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { 2850 update_before = cgroup_serial_nr_next;
2788 struct inode *inode = cgrp->dentry->d_inode; 2851
2852 mutex_unlock(&cgroup_mutex);
2853
2854 /* @root always needs to be updated */
2855 inode = root->dentry->d_inode;
2856 mutex_lock(&inode->i_mutex);
2857 mutex_lock(&cgroup_mutex);
2858 cgroup_addrm_files(root, ss, cfts, is_add);
2859 mutex_unlock(&cgroup_mutex);
2860 mutex_unlock(&inode->i_mutex);
2861
2862 /* add/rm files for all cgroups created before */
2863 rcu_read_lock();
2864 cgroup_for_each_descendant_pre(cgrp, root) {
2865 if (cgroup_is_dead(cgrp))
2866 continue;
2867
2868 inode = cgrp->dentry->d_inode;
2869 dget(cgrp->dentry);
2870 rcu_read_unlock();
2871
2872 dput(prev);
2873 prev = cgrp->dentry;
2789 2874
2790 mutex_lock(&inode->i_mutex); 2875 mutex_lock(&inode->i_mutex);
2791 mutex_lock(&cgroup_mutex); 2876 mutex_lock(&cgroup_mutex);
2792 if (!cgroup_is_removed(cgrp)) 2877 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2793 cgroup_addrm_files(cgrp, ss, cfts, is_add); 2878 cgroup_addrm_files(cgrp, ss, cfts, is_add);
2794 mutex_unlock(&cgroup_mutex); 2879 mutex_unlock(&cgroup_mutex);
2795 mutex_unlock(&inode->i_mutex); 2880 mutex_unlock(&inode->i_mutex);
2796 2881
2797 list_del_init(&cgrp->cft_q_node); 2882 rcu_read_lock();
2798 dput(cgrp->dentry);
2799 } 2883 }
2800 2884 rcu_read_unlock();
2801 mutex_unlock(&cgroup_cft_mutex); 2885 dput(prev);
2886 deactivate_super(sb);
2802} 2887}
2803 2888
2804/** 2889/**
@@ -2853,7 +2938,8 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2853 2938
2854 list_for_each_entry(set, &ss->cftsets, node) { 2939 list_for_each_entry(set, &ss->cftsets, node) {
2855 if (set->cfts == cfts) { 2940 if (set->cfts == cfts) {
2856 list_del_init(&set->node); 2941 list_del(&set->node);
2942 kfree(set);
2857 cgroup_cfts_commit(ss, cfts, false); 2943 cgroup_cfts_commit(ss, cfts, false);
2858 return 0; 2944 return 0;
2859 } 2945 }
@@ -2872,12 +2958,11 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2872int cgroup_task_count(const struct cgroup *cgrp) 2958int cgroup_task_count(const struct cgroup *cgrp)
2873{ 2959{
2874 int count = 0; 2960 int count = 0;
2875 struct cg_cgroup_link *link; 2961 struct cgrp_cset_link *link;
2876 2962
2877 read_lock(&css_set_lock); 2963 read_lock(&css_set_lock);
2878 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { 2964 list_for_each_entry(link, &cgrp->cset_links, cset_link)
2879 count += atomic_read(&link->cg->refcount); 2965 count += atomic_read(&link->cset->refcount);
2880 }
2881 read_unlock(&css_set_lock); 2966 read_unlock(&css_set_lock);
2882 return count; 2967 return count;
2883} 2968}
@@ -2886,25 +2971,24 @@ int cgroup_task_count(const struct cgroup *cgrp)
2886 * Advance a list_head iterator. The iterator should be positioned at 2971 * Advance a list_head iterator. The iterator should be positioned at
2887 * the start of a css_set 2972 * the start of a css_set
2888 */ 2973 */
2889static void cgroup_advance_iter(struct cgroup *cgrp, 2974static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it)
2890 struct cgroup_iter *it)
2891{ 2975{
2892 struct list_head *l = it->cg_link; 2976 struct list_head *l = it->cset_link;
2893 struct cg_cgroup_link *link; 2977 struct cgrp_cset_link *link;
2894 struct css_set *cg; 2978 struct css_set *cset;
2895 2979
2896 /* Advance to the next non-empty css_set */ 2980 /* Advance to the next non-empty css_set */
2897 do { 2981 do {
2898 l = l->next; 2982 l = l->next;
2899 if (l == &cgrp->css_sets) { 2983 if (l == &cgrp->cset_links) {
2900 it->cg_link = NULL; 2984 it->cset_link = NULL;
2901 return; 2985 return;
2902 } 2986 }
2903 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); 2987 link = list_entry(l, struct cgrp_cset_link, cset_link);
2904 cg = link->cg; 2988 cset = link->cset;
2905 } while (list_empty(&cg->tasks)); 2989 } while (list_empty(&cset->tasks));
2906 it->cg_link = l; 2990 it->cset_link = l;
2907 it->task = cg->tasks.next; 2991 it->task = cset->tasks.next;
2908} 2992}
2909 2993
2910/* 2994/*
@@ -2934,7 +3018,7 @@ static void cgroup_enable_task_cg_lists(void)
2934 * entry won't be deleted though the process has exited. 3018 * entry won't be deleted though the process has exited.
2935 */ 3019 */
2936 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) 3020 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
2937 list_add(&p->cg_list, &p->cgroups->tasks); 3021 list_add(&p->cg_list, &task_css_set(p)->tasks);
2938 task_unlock(p); 3022 task_unlock(p);
2939 } while_each_thread(g, p); 3023 } while_each_thread(g, p);
2940 read_unlock(&tasklist_lock); 3024 read_unlock(&tasklist_lock);
@@ -2942,12 +3026,67 @@ static void cgroup_enable_task_cg_lists(void)
2942} 3026}
2943 3027
2944/** 3028/**
3029 * cgroup_next_sibling - find the next sibling of a given cgroup
3030 * @pos: the current cgroup
3031 *
3032 * This function returns the next sibling of @pos and should be called
3033 * under RCU read lock. The only requirement is that @pos is accessible.
3034 * The next sibling is guaranteed to be returned regardless of @pos's
3035 * state.
3036 */
3037struct cgroup *cgroup_next_sibling(struct cgroup *pos)
3038{
3039 struct cgroup *next;
3040
3041 WARN_ON_ONCE(!rcu_read_lock_held());
3042
3043 /*
3044 * @pos could already have been removed. Once a cgroup is removed,
3045 * its ->sibling.next is no longer updated when its next sibling
3046 * changes. As CGRP_DEAD assertion is serialized and happens
3047 * before the cgroup is taken off the ->sibling list, if we see it
3048 * unasserted, it's guaranteed that the next sibling hasn't
3049 * finished its grace period even if it's already removed, and thus
3050 * safe to dereference from this RCU critical section. If
3051 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3052 * to be visible as %true here.
3053 */
3054 if (likely(!cgroup_is_dead(pos))) {
3055 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3056 if (&next->sibling != &pos->parent->children)
3057 return next;
3058 return NULL;
3059 }
3060
3061 /*
3062 * Can't dereference the next pointer. Each cgroup is given a
3063 * monotonically increasing unique serial number and always
3064 * appended to the sibling list, so the next one can be found by
3065 * walking the parent's children until we see a cgroup with higher
3066 * serial number than @pos's.
3067 *
3068 * While this path can be slow, it's taken only when either the
3069 * current cgroup is removed or iteration and removal race.
3070 */
3071 list_for_each_entry_rcu(next, &pos->parent->children, sibling)
3072 if (next->serial_nr > pos->serial_nr)
3073 return next;
3074 return NULL;
3075}
3076EXPORT_SYMBOL_GPL(cgroup_next_sibling);
3077
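
With that guarantee, a descendant walk may drop the RCU read lock (and even sleep) between visits and still resume correctly, which is exactly what the reworked cgroup_cfts_commit() above relies on. A hedged sketch of such a walker, paraphrasing the in-file loop; visit_descendants() and do_something_slow() are invented for illustration:

	#include <linux/cgroup.h>
	#include <linux/dcache.h>
	#include <linux/rcupdate.h>

	static void do_something_slow(struct cgroup *cgrp);	/* may sleep */

	static void visit_descendants(struct cgroup *top)
	{
		struct cgroup *pos;
		struct dentry *prev = NULL;

		rcu_read_lock();
		cgroup_for_each_descendant_pre(pos, top) {
			if (cgroup_is_dead(pos))
				continue;

			/* pin @pos so it stays accessible across the unlock */
			dget(pos->dentry);
			rcu_read_unlock();

			/* drop the previous pin outside the RCU section */
			dput(prev);
			prev = pos->dentry;

			do_something_slow(pos);

			rcu_read_lock();
		}
		rcu_read_unlock();
		dput(prev);
	}
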
3078/**
2945 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 3079 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
2946 * @pos: the current position (%NULL to initiate traversal) 3080 * @pos: the current position (%NULL to initiate traversal)
2947 * @cgroup: cgroup whose descendants to walk 3081 * @cgroup: cgroup whose descendants to walk
2948 * 3082 *
2949 * To be used by cgroup_for_each_descendant_pre(). Find the next 3083 * To be used by cgroup_for_each_descendant_pre(). Find the next
2950 * descendant to visit for pre-order traversal of @cgroup's descendants. 3084 * descendant to visit for pre-order traversal of @cgroup's descendants.
3085 *
3086 * While this function requires RCU read locking, it doesn't require the
3087 * whole traversal to be contained in a single RCU critical section. This
3088 * function will return the correct next descendant as long as both @pos
3089 * and @cgroup are accessible and @pos is a descendant of @cgroup.
2951 */ 3090 */
2952struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 3091struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2953 struct cgroup *cgroup) 3092 struct cgroup *cgroup)
@@ -2967,11 +3106,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2967 3106
2968 /* no child, visit my or the closest ancestor's next sibling */ 3107 /* no child, visit my or the closest ancestor's next sibling */
2969 while (pos != cgroup) { 3108 while (pos != cgroup) {
2970 next = list_entry_rcu(pos->sibling.next, struct cgroup, 3109 next = cgroup_next_sibling(pos);
2971 sibling); 3110 if (next)
2972 if (&next->sibling != &pos->parent->children)
2973 return next; 3111 return next;
2974
2975 pos = pos->parent; 3112 pos = pos->parent;
2976 } 3113 }
2977 3114
@@ -2986,6 +3123,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
2986 * Return the rightmost descendant of @pos. If there's no descendant, 3123 * Return the rightmost descendant of @pos. If there's no descendant,
2987 * @pos is returned. This can be used during pre-order traversal to skip 3124 * @pos is returned. This can be used during pre-order traversal to skip
2988 * subtree of @pos. 3125 * subtree of @pos.
3126 *
3127 * While this function requires RCU read locking, it doesn't require the
3128 * whole traversal to be contained in a single RCU critical section. This
3129 * function will return the correct rightmost descendant as long as @pos is
3130 * accessible.
2989 */ 3131 */
2990struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3132struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
2991{ 3133{
@@ -3025,6 +3167,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3025 * 3167 *
3026 * To be used by cgroup_for_each_descendant_post(). Find the next 3168 * To be used by cgroup_for_each_descendant_post(). Find the next
3027 * descendant to visit for post-order traversal of @cgroup's descendants. 3169 * descendant to visit for post-order traversal of @cgroup's descendants.
3170 *
3171 * While this function requires RCU read locking, it doesn't require the
3172 * whole traversal to be contained in a single RCU critical section. This
3173 * function will return the correct next descendant as long as both @pos
3174 * and @cgroup are accessible and @pos is a descendant of @cgroup.
3028 */ 3175 */
3029struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3176struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3030 struct cgroup *cgroup) 3177 struct cgroup *cgroup)
@@ -3040,8 +3187,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3040 } 3187 }
3041 3188
3042 /* if there's an unvisited sibling, visit its leftmost descendant */ 3189 /* if there's an unvisited sibling, visit its leftmost descendant */
3043 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3190 next = cgroup_next_sibling(pos);
3044 if (&next->sibling != &pos->parent->children) 3191 if (next)
3045 return cgroup_leftmost_descendant(next); 3192 return cgroup_leftmost_descendant(next);
3046 3193
3047 /* no sibling left, visit parent */ 3194 /* no sibling left, visit parent */
@@ -3062,7 +3209,7 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
3062 cgroup_enable_task_cg_lists(); 3209 cgroup_enable_task_cg_lists();
3063 3210
3064 read_lock(&css_set_lock); 3211 read_lock(&css_set_lock);
3065 it->cg_link = &cgrp->css_sets; 3212 it->cset_link = &cgrp->cset_links;
3066 cgroup_advance_iter(cgrp, it); 3213 cgroup_advance_iter(cgrp, it);
3067} 3214}
3068 3215
@@ -3071,16 +3218,16 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3071{ 3218{
3072 struct task_struct *res; 3219 struct task_struct *res;
3073 struct list_head *l = it->task; 3220 struct list_head *l = it->task;
3074 struct cg_cgroup_link *link; 3221 struct cgrp_cset_link *link;
3075 3222
3076 /* If the iterator cg is NULL, we have no tasks */ 3223 /* If the iterator cg is NULL, we have no tasks */
3077 if (!it->cg_link) 3224 if (!it->cset_link)
3078 return NULL; 3225 return NULL;
3079 res = list_entry(l, struct task_struct, cg_list); 3226 res = list_entry(l, struct task_struct, cg_list);
3080 /* Advance iterator to find next entry */ 3227 /* Advance iterator to find next entry */
3081 l = l->next; 3228 l = l->next;
3082 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); 3229 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3083 if (l == &link->cg->tasks) { 3230 if (l == &link->cset->tasks) {
3084 /* We reached the end of this task list - move on to 3231 /* We reached the end of this task list - move on to
3085 * the next cg_cgroup_link */ 3232 * the next cg_cgroup_link */
3086 cgroup_advance_iter(cgrp, it); 3233 cgroup_advance_iter(cgrp, it);
@@ -3411,7 +3558,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3411 } 3558 }
3412 } 3559 }
3413 /* entry not found; create a new one */ 3560 /* entry not found; create a new one */
3414 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3561 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3415 if (!l) { 3562 if (!l) {
3416 mutex_unlock(&cgrp->pidlist_mutex); 3563 mutex_unlock(&cgrp->pidlist_mutex);
3417 return l; 3564 return l;
@@ -3420,8 +3567,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3420 down_write(&l->mutex); 3567 down_write(&l->mutex);
3421 l->key.type = type; 3568 l->key.type = type;
3422 l->key.ns = get_pid_ns(ns); 3569 l->key.ns = get_pid_ns(ns);
3423 l->use_count = 0; /* don't increment here */
3424 l->list = NULL;
3425 l->owner = cgrp; 3570 l->owner = cgrp;
3426 list_add(&l->links, &cgrp->pidlists); 3571 list_add(&l->links, &cgrp->pidlists);
3427 mutex_unlock(&cgrp->pidlist_mutex); 3572 mutex_unlock(&cgrp->pidlist_mutex);
@@ -3727,6 +3872,23 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
3727} 3872}
3728 3873
3729/* 3874/*
3875 * When dput() is called asynchronously, if umount has been done and
3876 * then deactivate_super() in cgroup_free_fn() kills the superblock,
3877 * there's a small window in which vfs will see the root dentry with non-zero
3878 * refcnt and trigger BUG().
3879 *
3880 * That's why we hold a reference before dput() and drop it right after.
3881 */
3882static void cgroup_dput(struct cgroup *cgrp)
3883{
3884 struct super_block *sb = cgrp->root->sb;
3885
3886 atomic_inc(&sb->s_active);
3887 dput(cgrp->dentry);
3888 deactivate_super(sb);
3889}
3890
3891/*
3730 * Unregister event and free resources. 3892 * Unregister event and free resources.
3731 * 3893 *
3732 * Gets called from workqueue. 3894 * Gets called from workqueue.
@@ -3746,7 +3908,7 @@ static void cgroup_event_remove(struct work_struct *work)
3746 3908
3747 eventfd_ctx_put(event->eventfd); 3909 eventfd_ctx_put(event->eventfd);
3748 kfree(event); 3910 kfree(event);
3749 dput(cgrp->dentry); 3911 cgroup_dput(cgrp);
3750} 3912}
3751 3913
3752/* 3914/*
@@ -3933,33 +4095,16 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
3933 return 0; 4095 return 0;
3934} 4096}
3935 4097
3936/* 4098static struct cftype cgroup_base_files[] = {
3937 * for the common functions, 'private' gives the type of file
3938 */
3939/* for hysterical raisins, we can't put this on the older files */
3940#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
3941static struct cftype files[] = {
3942 {
3943 .name = "tasks",
3944 .open = cgroup_tasks_open,
3945 .write_u64 = cgroup_tasks_write,
3946 .release = cgroup_pidlist_release,
3947 .mode = S_IRUGO | S_IWUSR,
3948 },
3949 { 4099 {
3950 .name = CGROUP_FILE_GENERIC_PREFIX "procs", 4100 .name = "cgroup.procs",
3951 .open = cgroup_procs_open, 4101 .open = cgroup_procs_open,
3952 .write_u64 = cgroup_procs_write, 4102 .write_u64 = cgroup_procs_write,
3953 .release = cgroup_pidlist_release, 4103 .release = cgroup_pidlist_release,
3954 .mode = S_IRUGO | S_IWUSR, 4104 .mode = S_IRUGO | S_IWUSR,
3955 }, 4105 },
3956 { 4106 {
3957 .name = "notify_on_release", 4107 .name = "cgroup.event_control",
3958 .read_u64 = cgroup_read_notify_on_release,
3959 .write_u64 = cgroup_write_notify_on_release,
3960 },
3961 {
3962 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3963 .write_string = cgroup_write_event_control, 4108 .write_string = cgroup_write_event_control,
3964 .mode = S_IWUGO, 4109 .mode = S_IWUGO,
3965 }, 4110 },
@@ -3974,9 +4119,29 @@ static struct cftype files[] = {
3974 .flags = CFTYPE_ONLY_ON_ROOT, 4119 .flags = CFTYPE_ONLY_ON_ROOT,
3975 .read_seq_string = cgroup_sane_behavior_show, 4120 .read_seq_string = cgroup_sane_behavior_show,
3976 }, 4121 },
4122
4123 /*
4124 * Historical crazy stuff. These don't have "cgroup." prefix and
4125 * don't exist if sane_behavior. If you're depending on these, be
4126 * prepared to be burned.
4127 */
4128 {
4129 .name = "tasks",
4130 .flags = CFTYPE_INSANE, /* use "procs" instead */
4131 .open = cgroup_tasks_open,
4132 .write_u64 = cgroup_tasks_write,
4133 .release = cgroup_pidlist_release,
4134 .mode = S_IRUGO | S_IWUSR,
4135 },
4136 {
4137 .name = "notify_on_release",
4138 .flags = CFTYPE_INSANE,
4139 .read_u64 = cgroup_read_notify_on_release,
4140 .write_u64 = cgroup_write_notify_on_release,
4141 },
3977 { 4142 {
3978 .name = "release_agent", 4143 .name = "release_agent",
3979 .flags = CFTYPE_ONLY_ON_ROOT, 4144 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3980 .read_seq_string = cgroup_release_agent_show, 4145 .read_seq_string = cgroup_release_agent_show,
3981 .write_string = cgroup_release_agent_write, 4146 .write_string = cgroup_release_agent_write,
3982 .max_write_len = PATH_MAX, 4147 .max_write_len = PATH_MAX,
@@ -3997,13 +4162,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
3997 struct cgroup_subsys *ss; 4162 struct cgroup_subsys *ss;
3998 4163
3999 if (base_files) { 4164 if (base_files) {
4000 err = cgroup_addrm_files(cgrp, NULL, files, true); 4165 err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
4001 if (err < 0) 4166 if (err < 0)
4002 return err; 4167 return err;
4003 } 4168 }
4004 4169
4005 /* process cftsets of each subsystem */ 4170 /* process cftsets of each subsystem */
4006 for_each_subsys(cgrp->root, ss) { 4171 for_each_root_subsys(cgrp->root, ss) {
4007 struct cftype_set *set; 4172 struct cftype_set *set;
4008 if (!test_bit(ss->subsys_id, &subsys_mask)) 4173 if (!test_bit(ss->subsys_id, &subsys_mask))
4009 continue; 4174 continue;
@@ -4013,15 +4178,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
4013 } 4178 }
4014 4179
4015 /* This cgroup is ready now */ 4180 /* This cgroup is ready now */
4016 for_each_subsys(cgrp->root, ss) { 4181 for_each_root_subsys(cgrp->root, ss) {
4017 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4182 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4183 struct css_id *id = rcu_dereference_protected(css->id, true);
4184
4018 /* 4185 /*
4019 * Update id->css pointer and make this css visible from 4186 * Update id->css pointer and make this css visible from
4020 * CSS ID functions. This pointer will be dereferenced 4187
4021 * from RCU-read-side without locks. 4188 * from RCU-read-side without locks.
4022 */ 4189 */
4023 if (css->id) 4190 if (id)
4024 rcu_assign_pointer(css->id->css, css); 4191 rcu_assign_pointer(id->css, css);
4025 } 4192 }
4026 4193
4027 return 0; 4194 return 0;
@@ -4031,12 +4198,16 @@ static void css_dput_fn(struct work_struct *work)
4031{ 4198{
4032 struct cgroup_subsys_state *css = 4199 struct cgroup_subsys_state *css =
4033 container_of(work, struct cgroup_subsys_state, dput_work); 4200 container_of(work, struct cgroup_subsys_state, dput_work);
4034 struct dentry *dentry = css->cgroup->dentry;
4035 struct super_block *sb = dentry->d_sb;
4036 4201
4037 atomic_inc(&sb->s_active); 4202 cgroup_dput(css->cgroup);
4038 dput(dentry); 4203}
4039 deactivate_super(sb); 4204
4205static void css_release(struct percpu_ref *ref)
4206{
4207 struct cgroup_subsys_state *css =
4208 container_of(ref, struct cgroup_subsys_state, refcnt);
4209
4210 schedule_work(&css->dput_work);
4040} 4211}
4041 4212
4042static void init_cgroup_css(struct cgroup_subsys_state *css, 4213static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -4044,10 +4215,9 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
4044 struct cgroup *cgrp) 4215 struct cgroup *cgrp)
4045{ 4216{
4046 css->cgroup = cgrp; 4217 css->cgroup = cgrp;
4047 atomic_set(&css->refcnt, 1);
4048 css->flags = 0; 4218 css->flags = 0;
4049 css->id = NULL; 4219 css->id = NULL;
4050 if (cgrp == dummytop) 4220 if (cgrp == cgroup_dummy_top)
4051 css->flags |= CSS_ROOT; 4221 css->flags |= CSS_ROOT;
4052 BUG_ON(cgrp->subsys[ss->subsys_id]); 4222 BUG_ON(cgrp->subsys[ss->subsys_id]);
4053 cgrp->subsys[ss->subsys_id] = css; 4223 cgrp->subsys[ss->subsys_id] = css;
@@ -4157,7 +4327,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4157 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 4327 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4158 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4328 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4159 4329
4160 for_each_subsys(root, ss) { 4330 for_each_root_subsys(root, ss) {
4161 struct cgroup_subsys_state *css; 4331 struct cgroup_subsys_state *css;
4162 4332
4163 css = ss->css_alloc(cgrp); 4333 css = ss->css_alloc(cgrp);
@@ -4165,7 +4335,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4165 err = PTR_ERR(css); 4335 err = PTR_ERR(css);
4166 goto err_free_all; 4336 goto err_free_all;
4167 } 4337 }
4338
4339 err = percpu_ref_init(&css->refcnt, css_release);
4340 if (err)
4341 goto err_free_all;
4342
4168 init_cgroup_css(css, ss, cgrp); 4343 init_cgroup_css(css, ss, cgrp);
4344
4169 if (ss->use_id) { 4345 if (ss->use_id) {
4170 err = alloc_css_id(ss, parent, cgrp); 4346 err = alloc_css_id(ss, parent, cgrp);
4171 if (err) 4347 if (err)
@@ -4183,20 +4359,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4183 goto err_free_all; 4359 goto err_free_all;
4184 lockdep_assert_held(&dentry->d_inode->i_mutex); 4360 lockdep_assert_held(&dentry->d_inode->i_mutex);
4185 4361
4362 cgrp->serial_nr = cgroup_serial_nr_next++;
4363
4186 /* allocation complete, commit to creation */ 4364 /* allocation complete, commit to creation */
4187 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4188 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4365 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4189 root->number_of_cgroups++; 4366 root->number_of_cgroups++;
4190 4367
4191 /* each css holds a ref to the cgroup's dentry */ 4368 /* each css holds a ref to the cgroup's dentry */
4192 for_each_subsys(root, ss) 4369 for_each_root_subsys(root, ss)
4193 dget(dentry); 4370 dget(dentry);
4194 4371
4195 /* hold a ref to the parent's dentry */ 4372 /* hold a ref to the parent's dentry */
4196 dget(parent->dentry); 4373 dget(parent->dentry);
4197 4374
4198 /* creation succeeded, notify subsystems */ 4375 /* creation succeeded, notify subsystems */
4199 for_each_subsys(root, ss) { 4376 for_each_root_subsys(root, ss) {
4200 err = online_css(ss, cgrp); 4377 err = online_css(ss, cgrp);
4201 if (err) 4378 if (err)
4202 goto err_destroy; 4379 goto err_destroy;
@@ -4221,9 +4398,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4221 return 0; 4398 return 0;
4222 4399
4223err_free_all: 4400err_free_all:
4224 for_each_subsys(root, ss) { 4401 for_each_root_subsys(root, ss) {
4225 if (cgrp->subsys[ss->subsys_id]) 4402 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4403
4404 if (css) {
4405 percpu_ref_cancel_init(&css->refcnt);
4226 ss->css_free(cgrp); 4406 ss->css_free(cgrp);
4407 }
4227 } 4408 }
4228 mutex_unlock(&cgroup_mutex); 4409 mutex_unlock(&cgroup_mutex);
4229 /* Release the reference count that we took on the superblock */ 4410 /* Release the reference count that we took on the superblock */
@@ -4251,63 +4432,120 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4251 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4432 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4252} 4433}
4253 4434
4435static void cgroup_css_killed(struct cgroup *cgrp)
4436{
4437 if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
4438 return;
4439
4440 /* percpu ref's of all css's are killed, kick off the next step */
4441 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
4442 schedule_work(&cgrp->destroy_work);
4443}
4444
4445static void css_ref_killed_fn(struct percpu_ref *ref)
4446{
4447 struct cgroup_subsys_state *css =
4448 container_of(ref, struct cgroup_subsys_state, refcnt);
4449
4450 cgroup_css_killed(css->cgroup);
4451}
4452
4453/**
4454 * cgroup_destroy_locked - the first stage of cgroup destruction
4455 * @cgrp: cgroup to be destroyed
4456 *
4457 * css's make use of percpu refcnts whose killing latency shouldn't be
4458 * exposed to userland and are RCU protected. Also, cgroup core needs to
4459 * guarantee that css_tryget() won't succeed by the time ->css_offline() is
4460 * invoked. To satisfy all the requirements, destruction is implemented in
4461 * the following two steps.
4462 *
4463 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
4464 * userland visible parts and start killing the percpu refcnts of
4465 * css's. Set up so that the next stage will be kicked off once all
4466 * the percpu refcnts are confirmed to be killed.
4467 *
4468 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4469 * rest of destruction. Once all cgroup references are gone, the
4470 * cgroup is RCU-freed.
4471 *
4472 * This function implements s1. After this step, @cgrp is gone as far as
4473 * the userland is concerned and a new cgroup with the same name may be
4474 * created. As cgroup doesn't care about the names internally, this
4475 * doesn't cause any problem.
4476 */
4254static int cgroup_destroy_locked(struct cgroup *cgrp) 4477static int cgroup_destroy_locked(struct cgroup *cgrp)
4255 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4478 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4256{ 4479{
4257 struct dentry *d = cgrp->dentry; 4480 struct dentry *d = cgrp->dentry;
4258 struct cgroup *parent = cgrp->parent;
4259 struct cgroup_event *event, *tmp; 4481 struct cgroup_event *event, *tmp;
4260 struct cgroup_subsys *ss; 4482 struct cgroup_subsys *ss;
4483 bool empty;
4261 4484
4262 lockdep_assert_held(&d->d_inode->i_mutex); 4485 lockdep_assert_held(&d->d_inode->i_mutex);
4263 lockdep_assert_held(&cgroup_mutex); 4486 lockdep_assert_held(&cgroup_mutex);
4264 4487
4265 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) 4488 /*
4489 * css_set_lock synchronizes access to ->cset_links and prevents
4490 * @cgrp from being removed while __put_css_set() is in progress.
4491 */
4492 read_lock(&css_set_lock);
4493 empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children);
4494 read_unlock(&css_set_lock);
4495 if (!empty)
4266 return -EBUSY; 4496 return -EBUSY;
4267 4497
4268 /* 4498 /*
4269 * Block new css_tryget() by deactivating refcnt and mark @cgrp 4499 * Block new css_tryget() by killing css refcnts. cgroup core
4270 * removed. This makes future css_tryget() and child creation 4500 * guarantees that, by the time ->css_offline() is invoked, no new
4271 * attempts fail thus maintaining the removal conditions verified 4501 * css reference will be given out via css_tryget(). We can't
4272 * above. 4502 * simply call percpu_ref_kill() and proceed to offlining css's
4503 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4504 * as killed on all CPUs on return.
4505 *
4506 * Use percpu_ref_kill_and_confirm() to get notifications as each
4507 * css is confirmed to be seen as killed on all CPUs. The
4508 * notification callback keeps track of the number of css's to be
4509 * killed and schedules cgroup_offline_fn() to perform the rest of
4510 * destruction once the percpu refs of all css's are confirmed to
4511 * be killed.
4273 */ 4512 */
4274 for_each_subsys(cgrp->root, ss) { 4513 atomic_set(&cgrp->css_kill_cnt, 1);
4514 for_each_root_subsys(cgrp->root, ss) {
4275 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4515 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4276 4516
4277 WARN_ON(atomic_read(&css->refcnt) < 0); 4517 /*
4278 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4518 * Killing would put the base ref, but we need to keep it
4279 } 4519 * alive until after ->css_offline.
4280 set_bit(CGRP_REMOVED, &cgrp->flags); 4520 */
4521 percpu_ref_get(&css->refcnt);
4281 4522
4282 /* tell subsystems to initate destruction */ 4523 atomic_inc(&cgrp->css_kill_cnt);
4283 for_each_subsys(cgrp->root, ss) 4524 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4284 offline_css(ss, cgrp); 4525 }
4526 cgroup_css_killed(cgrp);
4285 4527
4286 /* 4528 /*
4287 * Put all the base refs. Each css holds an extra reference to the 4529 * Mark @cgrp dead. This prevents further task migration and child
4288 * cgroup's dentry and cgroup removal proceeds regardless of css 4530 * creation by disabling cgroup_lock_live_group(). Note that
4289 * refs. On the last put of each css, whenever that may be, the 4531 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to
4290 * extra dentry ref is put so that dentry destruction happens only 4532 * resume iteration after dropping RCU read lock. See
4291 * after all css's are released. 4533 * cgroup_next_sibling() for details.
4292 */ 4534 */
4293 for_each_subsys(cgrp->root, ss) 4535 set_bit(CGRP_DEAD, &cgrp->flags);
4294 css_put(cgrp->subsys[ss->subsys_id]);
4295 4536
4537 /* CGRP_DEAD is set, remove from ->release_list for the last time */
4296 raw_spin_lock(&release_list_lock); 4538 raw_spin_lock(&release_list_lock);
4297 if (!list_empty(&cgrp->release_list)) 4539 if (!list_empty(&cgrp->release_list))
4298 list_del_init(&cgrp->release_list); 4540 list_del_init(&cgrp->release_list);
4299 raw_spin_unlock(&release_list_lock); 4541 raw_spin_unlock(&release_list_lock);
4300 4542
4301 /* delete this cgroup from parent->children */ 4543 /*
4302 list_del_rcu(&cgrp->sibling); 4544 * Remove @cgrp directory. The removal puts the base ref but we
4303 list_del_init(&cgrp->allcg_node); 4545 * aren't quite done with @cgrp yet, so hold onto it.
4304 4546 */
4305 dget(d); 4547 dget(d);
4306 cgroup_d_remove_dir(d); 4548 cgroup_d_remove_dir(d);
4307 dput(d);
4308
4309 set_bit(CGRP_RELEASABLE, &parent->flags);
4310 check_for_release(parent);
4311 4549
4312 /* 4550 /*
4313 * Unregister events and notify userspace. 4551 * Unregister events and notify userspace.
@@ -4322,6 +4560,53 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4322 spin_unlock(&cgrp->event_list_lock); 4560 spin_unlock(&cgrp->event_list_lock);
4323 4561
4324 return 0; 4562 return 0;
4563};
4564
4565/**
4566 * cgroup_offline_fn - the second step of cgroup destruction
4567 * @work: cgroup->destroy_free_work
4568 *
4569 * This function is invoked from a work item for a cgroup which is being
4570 * destroyed after the percpu refcnts of all css's are guaranteed to be
4571 * seen as killed on all CPUs, and performs the rest of destruction. This
4572 * is the second step of destruction described in the comment above
4573 * cgroup_destroy_locked().
4574 */
4575static void cgroup_offline_fn(struct work_struct *work)
4576{
4577 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
4578 struct cgroup *parent = cgrp->parent;
4579 struct dentry *d = cgrp->dentry;
4580 struct cgroup_subsys *ss;
4581
4582 mutex_lock(&cgroup_mutex);
4583
4584 /*
4585 * css_tryget() is guaranteed to fail now. Tell subsystems to
4586 * initiate destruction.
4587 */
4588 for_each_root_subsys(cgrp->root, ss)
4589 offline_css(ss, cgrp);
4590
4591 /*
4592 * Put the css refs from cgroup_destroy_locked(). Each css holds
4593 * an extra reference to the cgroup's dentry and cgroup removal
4594 * proceeds regardless of css refs. On the last put of each css,
4595 * whenever that may be, the extra dentry ref is put so that dentry
4596 * destruction happens only after all css's are released.
4597 */
4598 for_each_root_subsys(cgrp->root, ss)
4599 css_put(cgrp->subsys[ss->subsys_id]);
4600
4601 /* delete this cgroup from parent->children */
4602 list_del_rcu(&cgrp->sibling);
4603
4604 dput(d);
4605
4606 set_bit(CGRP_RELEASABLE, &parent->flags);
4607 check_for_release(parent);
4608
4609 mutex_unlock(&cgroup_mutex);
4325} 4610}
4326 4611
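
For readers unfamiliar with the percpu_ref kill-and-confirm API that the two steps above are built on, here is a rough, self-contained sketch of the same lifecycle for a single object, against the API as it exists in kernels of this vintage. All names (my_obj, my_offline_fn, ...) are invented; the real code additionally keeps css_kill_cnt because a cgroup holds one ref per css.

	#include <linux/percpu-refcount.h>
	#include <linux/workqueue.h>
	#include <linux/slab.h>

	struct my_obj {
		struct percpu_ref	ref;
		struct work_struct	offline_work;
	};

	/* step 2: runs after the ref is confirmed killed on all CPUs */
	static void my_offline_fn(struct work_struct *work)
	{
		struct my_obj *obj = container_of(work, struct my_obj,
						  offline_work);

		/* ->css_offline()-style teardown goes here ... */
		percpu_ref_put(&obj->ref);	/* drop the ref taken in destroy */
	}

	/* confirmation callback: no new tryget can succeed past this point */
	static void my_ref_killed(struct percpu_ref *ref)
	{
		struct my_obj *obj = container_of(ref, struct my_obj, ref);

		schedule_work(&obj->offline_work);
	}

	/* release callback: the last reference is gone */
	static void my_release(struct percpu_ref *ref)
	{
		struct my_obj *obj = container_of(ref, struct my_obj, ref);

		kfree(obj);	/* assumes obj was kmalloc'ed */
	}

	static int my_obj_init(struct my_obj *obj)
	{
		INIT_WORK(&obj->offline_work, my_offline_fn);
		return percpu_ref_init(&obj->ref, my_release);
	}

	/* step 1: make the object invisible and start killing the ref */
	static void my_obj_destroy(struct my_obj *obj)
	{
		percpu_ref_get(&obj->ref);	/* stay alive until offline has run */
		percpu_ref_kill_and_confirm(&obj->ref, my_ref_killed);
	}
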
4327static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4612static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4361,12 +4646,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4361 cgroup_init_cftsets(ss); 4646 cgroup_init_cftsets(ss);
4362 4647
4363 /* Create the top cgroup state for this subsystem */ 4648 /* Create the top cgroup state for this subsystem */
4364 list_add(&ss->sibling, &rootnode.subsys_list); 4649 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4365 ss->root = &rootnode; 4650 ss->root = &cgroup_dummy_root;
4366 css = ss->css_alloc(dummytop); 4651 css = ss->css_alloc(cgroup_dummy_top);
4367 /* We don't handle early failures gracefully */ 4652 /* We don't handle early failures gracefully */
4368 BUG_ON(IS_ERR(css)); 4653 BUG_ON(IS_ERR(css));
4369 init_cgroup_css(css, ss, dummytop); 4654 init_cgroup_css(css, ss, cgroup_dummy_top);
4370 4655
4371 /* Update the init_css_set to contain a subsys 4656 /* Update the init_css_set to contain a subsys
4372 * pointer to this state - since the subsystem is 4657 * pointer to this state - since the subsystem is
@@ -4381,7 +4666,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4381 * need to invoke fork callbacks here. */ 4666 * need to invoke fork callbacks here. */
4382 BUG_ON(!list_empty(&init_task.tasks)); 4667 BUG_ON(!list_empty(&init_task.tasks));
4383 4668
4384 BUG_ON(online_css(ss, dummytop)); 4669 BUG_ON(online_css(ss, cgroup_dummy_top));
4385 4670
4386 mutex_unlock(&cgroup_mutex); 4671 mutex_unlock(&cgroup_mutex);
4387 4672
@@ -4404,7 +4689,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4404 struct cgroup_subsys_state *css; 4689 struct cgroup_subsys_state *css;
4405 int i, ret; 4690 int i, ret;
4406 struct hlist_node *tmp; 4691 struct hlist_node *tmp;
4407 struct css_set *cg; 4692 struct css_set *cset;
4408 unsigned long key; 4693 unsigned long key;
4409 4694
4410 /* check name and function validity */ 4695 /* check name and function validity */
@@ -4427,7 +4712,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4427 */ 4712 */
4428 if (ss->module == NULL) { 4713 if (ss->module == NULL) {
4429 /* a sanity check */ 4714 /* a sanity check */
4430 BUG_ON(subsys[ss->subsys_id] != ss); 4715 BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
4431 return 0; 4716 return 0;
4432 } 4717 }
4433 4718
@@ -4435,26 +4720,26 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4435 cgroup_init_cftsets(ss); 4720 cgroup_init_cftsets(ss);
4436 4721
4437 mutex_lock(&cgroup_mutex); 4722 mutex_lock(&cgroup_mutex);
4438 subsys[ss->subsys_id] = ss; 4723 cgroup_subsys[ss->subsys_id] = ss;
4439 4724
4440 /* 4725 /*
4441 * no ss->css_alloc seems to need anything important in the ss 4726 * no ss->css_alloc seems to need anything important in the ss
4442 * struct, so this can happen first (i.e. before the rootnode 4727 * struct, so this can happen first (i.e. before the dummy root
4443 * attachment). 4728 * attachment).
4444 */ 4729 */
4445 css = ss->css_alloc(dummytop); 4730 css = ss->css_alloc(cgroup_dummy_top);
4446 if (IS_ERR(css)) { 4731 if (IS_ERR(css)) {
4447 /* failure case - need to deassign the subsys[] slot. */ 4732 /* failure case - need to deassign the cgroup_subsys[] slot. */
4448 subsys[ss->subsys_id] = NULL; 4733 cgroup_subsys[ss->subsys_id] = NULL;
4449 mutex_unlock(&cgroup_mutex); 4734 mutex_unlock(&cgroup_mutex);
4450 return PTR_ERR(css); 4735 return PTR_ERR(css);
4451 } 4736 }
4452 4737
4453 list_add(&ss->sibling, &rootnode.subsys_list); 4738 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4454 ss->root = &rootnode; 4739 ss->root = &cgroup_dummy_root;
4455 4740
4456 /* our new subsystem will be attached to the dummy hierarchy. */ 4741 /* our new subsystem will be attached to the dummy hierarchy. */
4457 init_cgroup_css(css, ss, dummytop); 4742 init_cgroup_css(css, ss, cgroup_dummy_top);
4458 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4743 /* init_idr must be after init_cgroup_css because it sets css->id. */
4459 if (ss->use_id) { 4744 if (ss->use_id) {
4460 ret = cgroup_init_idr(ss, css); 4745 ret = cgroup_init_idr(ss, css);
@@ -4471,21 +4756,21 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4471 * this is all done under the css_set_lock. 4756 * this is all done under the css_set_lock.
4472 */ 4757 */
4473 write_lock(&css_set_lock); 4758 write_lock(&css_set_lock);
4474 hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { 4759 hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
4475 /* skip entries that we already rehashed */ 4760 /* skip entries that we already rehashed */
4476 if (cg->subsys[ss->subsys_id]) 4761 if (cset->subsys[ss->subsys_id])
4477 continue; 4762 continue;
4478 /* remove existing entry */ 4763 /* remove existing entry */
4479 hash_del(&cg->hlist); 4764 hash_del(&cset->hlist);
4480 /* set new value */ 4765 /* set new value */
4481 cg->subsys[ss->subsys_id] = css; 4766 cset->subsys[ss->subsys_id] = css;
4482 /* recompute hash and restore entry */ 4767 /* recompute hash and restore entry */
4483 key = css_set_hash(cg->subsys); 4768 key = css_set_hash(cset->subsys);
4484 hash_add(css_set_table, &cg->hlist, key); 4769 hash_add(css_set_table, &cset->hlist, key);
4485 } 4770 }
4486 write_unlock(&css_set_lock); 4771 write_unlock(&css_set_lock);
4487 4772
4488 ret = online_css(ss, dummytop); 4773 ret = online_css(ss, cgroup_dummy_top);
4489 if (ret) 4774 if (ret)
4490 goto err_unload; 4775 goto err_unload;
4491 4776
@@ -4511,7 +4796,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4511 */ 4796 */
4512void cgroup_unload_subsys(struct cgroup_subsys *ss) 4797void cgroup_unload_subsys(struct cgroup_subsys *ss)
4513{ 4798{
4514 struct cg_cgroup_link *link; 4799 struct cgrp_cset_link *link;
4515 4800
4516 BUG_ON(ss->module == NULL); 4801 BUG_ON(ss->module == NULL);
4517 4802
@@ -4520,45 +4805,46 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4520 * try_module_get in parse_cgroupfs_options should ensure that it 4805 * try_module_get in parse_cgroupfs_options should ensure that it
4521 * doesn't start being used while we're killing it off. 4806 * doesn't start being used while we're killing it off.
4522 */ 4807 */
4523 BUG_ON(ss->root != &rootnode); 4808 BUG_ON(ss->root != &cgroup_dummy_root);
4524 4809
4525 mutex_lock(&cgroup_mutex); 4810 mutex_lock(&cgroup_mutex);
4526 4811
4527 offline_css(ss, dummytop); 4812 offline_css(ss, cgroup_dummy_top);
4528 4813
4529 if (ss->use_id) 4814 if (ss->use_id)
4530 idr_destroy(&ss->idr); 4815 idr_destroy(&ss->idr);
4531 4816
4532 /* deassign the subsys_id */ 4817 /* deassign the subsys_id */
4533 subsys[ss->subsys_id] = NULL; 4818 cgroup_subsys[ss->subsys_id] = NULL;
4534 4819
4535 /* remove subsystem from rootnode's list of subsystems */ 4820 /* remove subsystem from the dummy root's list of subsystems */
4536 list_del_init(&ss->sibling); 4821 list_del_init(&ss->sibling);
4537 4822
4538 /* 4823 /*
4539 * disentangle the css from all css_sets attached to the dummytop. as 4824 * disentangle the css from all css_sets attached to the dummy
4540 * in loading, we need to pay our respects to the hashtable gods. 4825 * top. as in loading, we need to pay our respects to the hashtable
4826 * gods.
4541 */ 4827 */
4542 write_lock(&css_set_lock); 4828 write_lock(&css_set_lock);
4543 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { 4829 list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
4544 struct css_set *cg = link->cg; 4830 struct css_set *cset = link->cset;
4545 unsigned long key; 4831 unsigned long key;
4546 4832
4547 hash_del(&cg->hlist); 4833 hash_del(&cset->hlist);
4548 cg->subsys[ss->subsys_id] = NULL; 4834 cset->subsys[ss->subsys_id] = NULL;
4549 key = css_set_hash(cg->subsys); 4835 key = css_set_hash(cset->subsys);
4550 hash_add(css_set_table, &cg->hlist, key); 4836 hash_add(css_set_table, &cset->hlist, key);
4551 } 4837 }
4552 write_unlock(&css_set_lock); 4838 write_unlock(&css_set_lock);
4553 4839
4554 /* 4840 /*
4555 * remove subsystem's css from the dummytop and free it - need to 4841 * remove subsystem's css from the cgroup_dummy_top and free it -
4556 * free before marking as null because ss->css_free needs the 4842 * need to free before marking as null because ss->css_free needs
4557 * cgrp->subsys pointer to find their state. note that this also 4843 * the cgrp->subsys pointer to find their state. note that this
4558 * takes care of freeing the css_id. 4844 * also takes care of freeing the css_id.
4559 */ 4845 */
4560 ss->css_free(dummytop); 4846 ss->css_free(cgroup_dummy_top);
4561 dummytop->subsys[ss->subsys_id] = NULL; 4847 cgroup_dummy_top->subsys[ss->subsys_id] = NULL;
4562 4848
4563 mutex_unlock(&cgroup_mutex); 4849 mutex_unlock(&cgroup_mutex);
4564} 4850}
@@ -4572,30 +4858,25 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
4572 */ 4858 */
4573int __init cgroup_init_early(void) 4859int __init cgroup_init_early(void)
4574{ 4860{
4861 struct cgroup_subsys *ss;
4575 int i; 4862 int i;
4863
4576 atomic_set(&init_css_set.refcount, 1); 4864 atomic_set(&init_css_set.refcount, 1);
4577 INIT_LIST_HEAD(&init_css_set.cg_links); 4865 INIT_LIST_HEAD(&init_css_set.cgrp_links);
4578 INIT_LIST_HEAD(&init_css_set.tasks); 4866 INIT_LIST_HEAD(&init_css_set.tasks);
4579 INIT_HLIST_NODE(&init_css_set.hlist); 4867 INIT_HLIST_NODE(&init_css_set.hlist);
4580 css_set_count = 1; 4868 css_set_count = 1;
4581 init_cgroup_root(&rootnode); 4869 init_cgroup_root(&cgroup_dummy_root);
4582 root_count = 1; 4870 cgroup_root_count = 1;
4583 init_task.cgroups = &init_css_set; 4871 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4584 4872
4585 init_css_set_link.cg = &init_css_set; 4873 init_cgrp_cset_link.cset = &init_css_set;
4586 init_css_set_link.cgrp = dummytop; 4874 init_cgrp_cset_link.cgrp = cgroup_dummy_top;
4587 list_add(&init_css_set_link.cgrp_link_list, 4875 list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
4588 &rootnode.top_cgroup.css_sets); 4876 list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);
4589 list_add(&init_css_set_link.cg_link_list,
4590 &init_css_set.cg_links);
4591
4592 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4593 struct cgroup_subsys *ss = subsys[i];
4594
4595 /* at bootup time, we don't worry about modular subsystems */
4596 if (!ss || ss->module)
4597 continue;
4598 4877
4878 /* at bootup time, we don't worry about modular subsystems */
4879 for_each_builtin_subsys(ss, i) {
4599 BUG_ON(!ss->name); 4880 BUG_ON(!ss->name);
4600 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4881 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4601 BUG_ON(!ss->css_alloc); 4882 BUG_ON(!ss->css_alloc);
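
The open-coded loop with its "!ss || ss->module" filter is replaced by for_each_builtin_subsys(). Its definition is not part of this hunk; one plausible shape for such an iterator (shown only to make the refactor readable — built-in slots are never modular, so the filter disappears) is:

/* Plausible shape only; the real definition lives elsewhere in cgroup.c
 * and may differ in detail. */
#define for_each_builtin_subsys(ss, i)					\
	for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[i]) || true); (i)++)
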
@@ -4620,30 +4901,33 @@ int __init cgroup_init_early(void)
4620 */ 4901 */
4621int __init cgroup_init(void) 4902int __init cgroup_init(void)
4622{ 4903{
4623 int err; 4904 struct cgroup_subsys *ss;
4624 int i;
4625 unsigned long key; 4905 unsigned long key;
4906 int i, err;
4626 4907
4627 err = bdi_init(&cgroup_backing_dev_info); 4908 err = bdi_init(&cgroup_backing_dev_info);
4628 if (err) 4909 if (err)
4629 return err; 4910 return err;
4630 4911
4631 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4912 for_each_builtin_subsys(ss, i) {
4632 struct cgroup_subsys *ss = subsys[i];
4633
4634 /* at bootup time, we don't worry about modular subsystems */
4635 if (!ss || ss->module)
4636 continue;
4637 if (!ss->early_init) 4913 if (!ss->early_init)
4638 cgroup_init_subsys(ss); 4914 cgroup_init_subsys(ss);
4639 if (ss->use_id) 4915 if (ss->use_id)
4640 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); 4916 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
4641 } 4917 }
4642 4918
4919 /* allocate id for the dummy hierarchy */
4920 mutex_lock(&cgroup_mutex);
4921 mutex_lock(&cgroup_root_mutex);
4922
4643 /* Add init_css_set to the hash table */ 4923 /* Add init_css_set to the hash table */
4644 key = css_set_hash(init_css_set.subsys); 4924 key = css_set_hash(init_css_set.subsys);
4645 hash_add(css_set_table, &init_css_set.hlist, key); 4925 hash_add(css_set_table, &init_css_set.hlist, key);
4646 BUG_ON(!init_root_id(&rootnode)); 4926
4927 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
4928
4929 mutex_unlock(&cgroup_root_mutex);
4930 mutex_unlock(&cgroup_mutex);
4647 4931
4648 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4932 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4649 if (!cgroup_kobj) { 4933 if (!cgroup_kobj) {
@@ -4708,7 +4992,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4708 int count = 0; 4992 int count = 0;
4709 4993
4710 seq_printf(m, "%d:", root->hierarchy_id); 4994 seq_printf(m, "%d:", root->hierarchy_id);
4711 for_each_subsys(root, ss) 4995 for_each_root_subsys(root, ss)
4712 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4996 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4713 if (strlen(root->name)) 4997 if (strlen(root->name))
4714 seq_printf(m, "%sname=%s", count ? "," : "", 4998 seq_printf(m, "%sname=%s", count ? "," : "",
@@ -4734,6 +5018,7 @@ out:
4734/* Display information about each subsystem and each hierarchy */ 5018/* Display information about each subsystem and each hierarchy */
4735static int proc_cgroupstats_show(struct seq_file *m, void *v) 5019static int proc_cgroupstats_show(struct seq_file *m, void *v)
4736{ 5020{
5021 struct cgroup_subsys *ss;
4737 int i; 5022 int i;
4738 5023
4739 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 5024 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
@@ -4743,14 +5028,12 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
4743 * subsys/hierarchy state. 5028 * subsys/hierarchy state.
4744 */ 5029 */
4745 mutex_lock(&cgroup_mutex); 5030 mutex_lock(&cgroup_mutex);
4746 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 5031
4747 struct cgroup_subsys *ss = subsys[i]; 5032 for_each_subsys(ss, i)
4748 if (ss == NULL)
4749 continue;
4750 seq_printf(m, "%s\t%d\t%d\t%d\n", 5033 seq_printf(m, "%s\t%d\t%d\t%d\n",
4751 ss->name, ss->root->hierarchy_id, 5034 ss->name, ss->root->hierarchy_id,
4752 ss->root->number_of_cgroups, !ss->disabled); 5035 ss->root->number_of_cgroups, !ss->disabled);
4753 } 5036
4754 mutex_unlock(&cgroup_mutex); 5037 mutex_unlock(&cgroup_mutex);
4755 return 0; 5038 return 0;
4756} 5039}
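
The emitted format is unchanged by the for_each_subsys() cleanup; /proc/cgroups still reads as a tab-separated table, for example (values illustrative):

#subsys_name	hierarchy	num_cgroups	enabled
cpuset	2	1	1
cpu	3	8	1
cpuacct	4	1	1
memory	5	12	1
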
@@ -4786,8 +5069,8 @@ static const struct file_operations proc_cgroupstats_operations = {
4786void cgroup_fork(struct task_struct *child) 5069void cgroup_fork(struct task_struct *child)
4787{ 5070{
4788 task_lock(current); 5071 task_lock(current);
5072 get_css_set(task_css_set(current));
4789 child->cgroups = current->cgroups; 5073 child->cgroups = current->cgroups;
4790 get_css_set(child->cgroups);
4791 task_unlock(current); 5074 task_unlock(current);
4792 INIT_LIST_HEAD(&child->cg_list); 5075 INIT_LIST_HEAD(&child->cg_list);
4793} 5076}
@@ -4804,6 +5087,7 @@ void cgroup_fork(struct task_struct *child)
4804 */ 5087 */
4805void cgroup_post_fork(struct task_struct *child) 5088void cgroup_post_fork(struct task_struct *child)
4806{ 5089{
5090 struct cgroup_subsys *ss;
4807 int i; 5091 int i;
4808 5092
4809 /* 5093 /*
@@ -4821,7 +5105,7 @@ void cgroup_post_fork(struct task_struct *child)
4821 write_lock(&css_set_lock); 5105 write_lock(&css_set_lock);
4822 task_lock(child); 5106 task_lock(child);
4823 if (list_empty(&child->cg_list)) 5107 if (list_empty(&child->cg_list))
4824 list_add(&child->cg_list, &child->cgroups->tasks); 5108 list_add(&child->cg_list, &task_css_set(child)->tasks);
4825 task_unlock(child); 5109 task_unlock(child);
4826 write_unlock(&css_set_lock); 5110 write_unlock(&css_set_lock);
4827 } 5111 }
@@ -4840,12 +5124,9 @@ void cgroup_post_fork(struct task_struct *child)
4840 * of the array can be freed at module unload, so we 5124 * of the array can be freed at module unload, so we
4841 * can't touch that. 5125 * can't touch that.
4842 */ 5126 */
4843 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 5127 for_each_builtin_subsys(ss, i)
4844 struct cgroup_subsys *ss = subsys[i];
4845
4846 if (ss->fork) 5128 if (ss->fork)
4847 ss->fork(child); 5129 ss->fork(child);
4848 }
4849 } 5130 }
4850} 5131}
4851 5132
@@ -4886,7 +5167,8 @@ void cgroup_post_fork(struct task_struct *child)
4886 */ 5167 */
4887void cgroup_exit(struct task_struct *tsk, int run_callbacks) 5168void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4888{ 5169{
4889 struct css_set *cg; 5170 struct cgroup_subsys *ss;
5171 struct css_set *cset;
4890 int i; 5172 int i;
4891 5173
4892 /* 5174 /*
@@ -4903,36 +5185,32 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4903 5185
4904 /* Reassign the task to the init_css_set. */ 5186 /* Reassign the task to the init_css_set. */
4905 task_lock(tsk); 5187 task_lock(tsk);
4906 cg = tsk->cgroups; 5188 cset = task_css_set(tsk);
4907 tsk->cgroups = &init_css_set; 5189 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
4908 5190
4909 if (run_callbacks && need_forkexit_callback) { 5191 if (run_callbacks && need_forkexit_callback) {
4910 /* 5192 /*
4911 * fork/exit callbacks are supported only for builtin 5193 * fork/exit callbacks are supported only for builtin
4912 * subsystems, see cgroup_post_fork() for details. 5194 * subsystems, see cgroup_post_fork() for details.
4913 */ 5195 */
4914 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 5196 for_each_builtin_subsys(ss, i) {
4915 struct cgroup_subsys *ss = subsys[i];
4916
4917 if (ss->exit) { 5197 if (ss->exit) {
4918 struct cgroup *old_cgrp = 5198 struct cgroup *old_cgrp = cset->subsys[i]->cgroup;
4919 rcu_dereference_raw(cg->subsys[i])->cgroup;
4920 struct cgroup *cgrp = task_cgroup(tsk, i); 5199 struct cgroup *cgrp = task_cgroup(tsk, i);
5200
4921 ss->exit(cgrp, old_cgrp, tsk); 5201 ss->exit(cgrp, old_cgrp, tsk);
4922 } 5202 }
4923 } 5203 }
4924 } 5204 }
4925 task_unlock(tsk); 5205 task_unlock(tsk);
4926 5206
4927 put_css_set_taskexit(cg); 5207 put_css_set_taskexit(cset);
4928} 5208}
4929 5209
4930static void check_for_release(struct cgroup *cgrp) 5210static void check_for_release(struct cgroup *cgrp)
4931{ 5211{
4932 /* All of these checks rely on RCU to keep the cgroup
4933 * structure alive */
4934 if (cgroup_is_releasable(cgrp) && 5212 if (cgroup_is_releasable(cgrp) &&
4935 !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { 5213 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
4936 /* 5214 /*
4937 * Control Group is currently removable. If it's not 5215
4938 * already queued for a userspace notification, queue 5216 * already queued for a userspace notification, queue
@@ -4941,7 +5219,7 @@ static void check_for_release(struct cgroup *cgrp)
4941 int need_schedule_work = 0; 5219 int need_schedule_work = 0;
4942 5220
4943 raw_spin_lock(&release_list_lock); 5221 raw_spin_lock(&release_list_lock);
4944 if (!cgroup_is_removed(cgrp) && 5222 if (!cgroup_is_dead(cgrp) &&
4945 list_empty(&cgrp->release_list)) { 5223 list_empty(&cgrp->release_list)) {
4946 list_add(&cgrp->release_list, &release_list); 5224 list_add(&cgrp->release_list, &release_list);
4947 need_schedule_work = 1; 5225 need_schedule_work = 1;
@@ -4952,34 +5230,6 @@ static void check_for_release(struct cgroup *cgrp)
4952 } 5230 }
4953} 5231}
4954 5232
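
check_for_release() above follows the common "test under a spinlock, queue at most once, kick a worker outside the lock" shape. With the cgroup-specific releasable/dead tests stripped away and hypothetical names, the skeleton is:

/* Generic notify-once queuing; notif_obj/release_* names are hypothetical. */
struct notif_obj {
	struct list_head release_entry;	/* INIT_LIST_HEAD() at creation */
	bool dead;
};

static LIST_HEAD(release_list);
static DEFINE_RAW_SPINLOCK(release_lock);

static void release_workfn(struct work_struct *work)
{
	/* drain release_list and notify userspace; details elided */
}
static DECLARE_WORK(release_work, release_workfn);

static void queue_release_once(struct notif_obj *obj)
{
	int need_schedule_work = 0;

	raw_spin_lock(&release_lock);
	if (!obj->dead && list_empty(&obj->release_entry)) {
		list_add(&obj->release_entry, &release_list);
		need_schedule_work = 1;		/* first time on the list */
	}
	raw_spin_unlock(&release_lock);
	if (need_schedule_work)
		schedule_work(&release_work);	/* run the notifier later */
}
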
4955/* Caller must verify that the css is not for root cgroup */
4956bool __css_tryget(struct cgroup_subsys_state *css)
4957{
4958 while (true) {
4959 int t, v;
4960
4961 v = css_refcnt(css);
4962 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
4963 if (likely(t == v))
4964 return true;
4965 else if (t < 0)
4966 return false;
4967 cpu_relax();
4968 }
4969}
4970EXPORT_SYMBOL_GPL(__css_tryget);
4971
4972/* Caller must verify that the css is not for root cgroup */
4973void __css_put(struct cgroup_subsys_state *css)
4974{
4975 int v;
4976
4977 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
4978 if (v == 0)
4979 schedule_work(&css->dput_work);
4980}
4981EXPORT_SYMBOL_GPL(__css_put);
4982
4983/* 5233/*
4984 * Notify userspace when a cgroup is released, by running the 5234 * Notify userspace when a cgroup is released, by running the
4985 * configured release agent with the name of the cgroup (path 5235 * configured release agent with the name of the cgroup (path
@@ -5054,23 +5304,19 @@ static void cgroup_release_agent(struct work_struct *work)
5054 5304
5055static int __init cgroup_disable(char *str) 5305static int __init cgroup_disable(char *str)
5056{ 5306{
5057 int i; 5307 struct cgroup_subsys *ss;
5058 char *token; 5308 char *token;
5309 int i;
5059 5310
5060 while ((token = strsep(&str, ",")) != NULL) { 5311 while ((token = strsep(&str, ",")) != NULL) {
5061 if (!*token) 5312 if (!*token)
5062 continue; 5313 continue;
5063 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
5064 struct cgroup_subsys *ss = subsys[i];
5065
5066 /*
5067 * cgroup_disable, being at boot time, can't
5068 * know about module subsystems, so we don't
5069 * worry about them.
5070 */
5071 if (!ss || ss->module)
5072 continue;
5073 5314
5315 /*
5316 * cgroup_disable, being at boot time, can't know about
5317 * module subsystems, so we don't worry about them.
5318 */
5319 for_each_builtin_subsys(ss, i) {
5074 if (!strcmp(token, ss->name)) { 5320 if (!strcmp(token, ss->name)) {
5075 ss->disabled = 1; 5321 ss->disabled = 1;
5076 printk(KERN_INFO "Disabling %s control group" 5322 printk(KERN_INFO "Disabling %s control group"
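
The restructured loop keeps the usual strsep() idiom for comma-separated boot options: consume the string destructively, skip empty tokens, match each one. Stand-alone (userspace, hypothetical option string):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[] = "cpuset,,memory";		/* note the empty token */
	char *str = buf, *token;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)			/* skip "", as the kernel loop does */
			continue;
		printf("would disable: %s\n", token);
	}
	return 0;
}
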
@@ -5087,9 +5333,7 @@ __setup("cgroup_disable=", cgroup_disable);
5087 * Functions for CSS ID. 5333
5088 */ 5334 */
5089 5335
5090/* 5336/* to get ID other than 0, this should be called when !cgroup_is_dead() */
5091 *To get ID other than 0, this should be called when !cgroup_is_removed().
5092 */
5093unsigned short css_id(struct cgroup_subsys_state *css) 5337unsigned short css_id(struct cgroup_subsys_state *css)
5094{ 5338{
5095 struct css_id *cssid; 5339 struct css_id *cssid;
@@ -5099,7 +5343,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
5099 * on this or this is under rcu_read_lock(). Once css->id is allocated, 5343 * on this or this is under rcu_read_lock(). Once css->id is allocated,
5100 * it's unchanged until freed. 5344 * it's unchanged until freed.
5101 */ 5345 */
5102 cssid = rcu_dereference_check(css->id, css_refcnt(css)); 5346 cssid = rcu_dereference_raw(css->id);
5103 5347
5104 if (cssid) 5348 if (cssid)
5105 return cssid->id; 5349 return cssid->id;
@@ -5107,18 +5351,6 @@ unsigned short css_id(struct cgroup_subsys_state *css)
5107} 5351}
5108EXPORT_SYMBOL_GPL(css_id); 5352EXPORT_SYMBOL_GPL(css_id);
5109 5353
5110unsigned short css_depth(struct cgroup_subsys_state *css)
5111{
5112 struct css_id *cssid;
5113
5114 cssid = rcu_dereference_check(css->id, css_refcnt(css));
5115
5116 if (cssid)
5117 return cssid->depth;
5118 return 0;
5119}
5120EXPORT_SYMBOL_GPL(css_depth);
5121
5122/** 5354/**
5123 * css_is_ancestor - test "root" css is an ancestor of "child" 5355 * css_is_ancestor - test "root" css is an ancestor of "child"
5124 * @child: the css to be tested. 5356 * @child: the css to be tested.
@@ -5153,7 +5385,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
5153 5385
5154void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 5386void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
5155{ 5387{
5156 struct css_id *id = css->id; 5388 struct css_id *id = rcu_dereference_protected(css->id, true);
5389
5157 /* When this is called before css_id initialization, id can be NULL */ 5390 /* When this is called before css_id initialization, id can be NULL */
5158 if (!id) 5391 if (!id)
5159 return; 5392 return;
@@ -5219,8 +5452,8 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5219 return PTR_ERR(newid); 5452 return PTR_ERR(newid);
5220 5453
5221 newid->stack[0] = newid->id; 5454 newid->stack[0] = newid->id;
5222 newid->css = rootcss; 5455 RCU_INIT_POINTER(newid->css, rootcss);
5223 rootcss->id = newid; 5456 RCU_INIT_POINTER(rootcss->id, newid);
5224 return 0; 5457 return 0;
5225} 5458}
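
The switch to RCU_INIT_POINTER() here relies on a standard rule: the new css_id has not yet been published to any reader, so plain initialization does not need the ordering that rcu_assign_pointer() provides. Illustrative only, with hypothetical foo/bar types (assumes <linux/rcupdate.h>):

struct bar;
struct foo {
	struct bar __rcu *b;
};

static void foo_init(struct foo *f, struct bar *b)
{
	/* f is not yet visible to any RCU reader: no ordering needed */
	RCU_INIT_POINTER(f->b, b);
}

static void foo_update(struct foo *f, struct bar *newb)
{
	/* readers may already be dereferencing f->b: publish with a barrier */
	rcu_assign_pointer(f->b, newb);
}
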
5226 5459
@@ -5234,7 +5467,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
5234 subsys_id = ss->subsys_id; 5467 subsys_id = ss->subsys_id;
5235 parent_css = parent->subsys[subsys_id]; 5468 parent_css = parent->subsys[subsys_id];
5236 child_css = child->subsys[subsys_id]; 5469 child_css = child->subsys[subsys_id];
5237 parent_id = parent_css->id; 5470 parent_id = rcu_dereference_protected(parent_css->id, true);
5238 depth = parent_id->depth + 1; 5471 depth = parent_id->depth + 1;
5239 5472
5240 child_id = get_new_cssid(ss, depth); 5473 child_id = get_new_cssid(ss, depth);
@@ -5299,7 +5532,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5299} 5532}
5300 5533
5301#ifdef CONFIG_CGROUP_DEBUG 5534#ifdef CONFIG_CGROUP_DEBUG
5302static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) 5535static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
5303{ 5536{
5304 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5537 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5305 5538
@@ -5309,48 +5542,43 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5309 return css; 5542 return css;
5310} 5543}
5311 5544
5312static void debug_css_free(struct cgroup *cont) 5545static void debug_css_free(struct cgroup *cgrp)
5313{
5314 kfree(cont->subsys[debug_subsys_id]);
5315}
5316
5317static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
5318{ 5546{
5319 return atomic_read(&cont->count); 5547 kfree(cgrp->subsys[debug_subsys_id]);
5320} 5548}
5321 5549
5322static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) 5550static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft)
5323{ 5551{
5324 return cgroup_task_count(cont); 5552 return cgroup_task_count(cgrp);
5325} 5553}
5326 5554
5327static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) 5555static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft)
5328{ 5556{
5329 return (u64)(unsigned long)current->cgroups; 5557 return (u64)(unsigned long)current->cgroups;
5330} 5558}
5331 5559
5332static u64 current_css_set_refcount_read(struct cgroup *cont, 5560static u64 current_css_set_refcount_read(struct cgroup *cgrp,
5333 struct cftype *cft) 5561 struct cftype *cft)
5334{ 5562{
5335 u64 count; 5563 u64 count;
5336 5564
5337 rcu_read_lock(); 5565 rcu_read_lock();
5338 count = atomic_read(&current->cgroups->refcount); 5566 count = atomic_read(&task_css_set(current)->refcount);
5339 rcu_read_unlock(); 5567 rcu_read_unlock();
5340 return count; 5568 return count;
5341} 5569}
5342 5570
5343static int current_css_set_cg_links_read(struct cgroup *cont, 5571static int current_css_set_cg_links_read(struct cgroup *cgrp,
5344 struct cftype *cft, 5572 struct cftype *cft,
5345 struct seq_file *seq) 5573 struct seq_file *seq)
5346{ 5574{
5347 struct cg_cgroup_link *link; 5575 struct cgrp_cset_link *link;
5348 struct css_set *cg; 5576 struct css_set *cset;
5349 5577
5350 read_lock(&css_set_lock); 5578 read_lock(&css_set_lock);
5351 rcu_read_lock(); 5579 rcu_read_lock();
5352 cg = rcu_dereference(current->cgroups); 5580 cset = rcu_dereference(current->cgroups);
5353 list_for_each_entry(link, &cg->cg_links, cg_link_list) { 5581 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
5354 struct cgroup *c = link->cgrp; 5582 struct cgroup *c = link->cgrp;
5355 const char *name; 5583 const char *name;
5356 5584
@@ -5367,19 +5595,19 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
5367} 5595}
5368 5596
5369#define MAX_TASKS_SHOWN_PER_CSS 25 5597#define MAX_TASKS_SHOWN_PER_CSS 25
5370static int cgroup_css_links_read(struct cgroup *cont, 5598static int cgroup_css_links_read(struct cgroup *cgrp,
5371 struct cftype *cft, 5599 struct cftype *cft,
5372 struct seq_file *seq) 5600 struct seq_file *seq)
5373{ 5601{
5374 struct cg_cgroup_link *link; 5602 struct cgrp_cset_link *link;
5375 5603
5376 read_lock(&css_set_lock); 5604 read_lock(&css_set_lock);
5377 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { 5605 list_for_each_entry(link, &cgrp->cset_links, cset_link) {
5378 struct css_set *cg = link->cg; 5606 struct css_set *cset = link->cset;
5379 struct task_struct *task; 5607 struct task_struct *task;
5380 int count = 0; 5608 int count = 0;
5381 seq_printf(seq, "css_set %p\n", cg); 5609 seq_printf(seq, "css_set %p\n", cset);
5382 list_for_each_entry(task, &cg->tasks, cg_list) { 5610 list_for_each_entry(task, &cset->tasks, cg_list) {
5383 if (count++ > MAX_TASKS_SHOWN_PER_CSS) { 5611 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
5384 seq_puts(seq, " ...\n"); 5612 seq_puts(seq, " ...\n");
5385 break; 5613 break;
@@ -5400,10 +5628,6 @@ static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
5400 5628
5401static struct cftype debug_files[] = { 5629static struct cftype debug_files[] = {
5402 { 5630 {
5403 .name = "cgroup_refcount",
5404 .read_u64 = cgroup_refcount_read,
5405 },
5406 {
5407 .name = "taskcount", 5631 .name = "taskcount",
5408 .read_u64 = debug_taskcount_read, 5632 .read_u64 = debug_taskcount_read,
5409 }, 5633 },
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 65349f07b878..383f8231e436 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -15,7 +15,6 @@
15 */ 15 */
16 16
17#include <linux/context_tracking.h> 17#include <linux/context_tracking.h>
18#include <linux/kvm_host.h>
19#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
20#include <linux/sched.h> 19#include <linux/sched.h>
21#include <linux/hardirq.h> 20#include <linux/hardirq.h>
@@ -71,6 +70,46 @@ void user_enter(void)
71 local_irq_restore(flags); 70 local_irq_restore(flags);
72} 71}
73 72
73#ifdef CONFIG_PREEMPT
74/**
75 * preempt_schedule_context - preempt_schedule called by tracing
76 *
77 * The tracing infrastructure uses preempt_enable_notrace to prevent
78 * recursion and tracing preempt enabling caused by the tracing
79 * infrastructure itself. But as tracing can happen in areas coming
80 * from userspace or just about to enter userspace, a preempt enable
81 * can occur before user_exit() is called. This will cause the scheduler
82 * to be called when the system is still in usermode.
83 *
84 * To prevent this, the preempt_enable_notrace will use this function
85 * instead of preempt_schedule() to exit user context if needed before
86 * calling the scheduler.
87 */
88void __sched notrace preempt_schedule_context(void)
89{
90 struct thread_info *ti = current_thread_info();
91 enum ctx_state prev_ctx;
92
93 if (likely(ti->preempt_count || irqs_disabled()))
94 return;
95
96 /*
97 * Need to disable preemption in case user_exit() is traced
98 * and the tracer calls preempt_enable_notrace() causing
99 * an infinite recursion.
100 */
101 preempt_disable_notrace();
102 prev_ctx = exception_enter();
103 preempt_enable_no_resched_notrace();
104
105 preempt_schedule();
106
107 preempt_disable_notrace();
108 exception_exit(prev_ctx);
109 preempt_enable_notrace();
110}
111EXPORT_SYMBOL_GPL(preempt_schedule_context);
112#endif /* CONFIG_PREEMPT */
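
preempt_schedule_context() only helps if the notrace preempt-enable path calls it instead of a plain preempt_schedule(). That wiring is not part of this hunk; a hypothetical call site, built from the helper above plus the standard need_resched(), would look roughly like:

/* Hypothetical sketch, not the real preempt_enable_notrace() macro; it only
 * illustrates why the context-tracking variant gets picked. */
static inline void example_preempt_enable_notrace(void)
{
	preempt_enable_no_resched_notrace();	/* drop the count, untraced */
	if (need_resched())
		preempt_schedule_context();	/* exits user context first */
}
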
74 113
75/** 114/**
76 * user_exit - Inform the context tracking that the CPU is 115 * user_exit - Inform the context tracking that the CPU is
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b5e4ab2d427e..198a38883e64 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -133,6 +133,27 @@ static void cpu_hotplug_done(void)
133 mutex_unlock(&cpu_hotplug.lock); 133 mutex_unlock(&cpu_hotplug.lock);
134} 134}
135 135
136/*
137 * Wait for currently running CPU hotplug operations to complete (if any) and
138 * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
139 * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
140 * hotplug path before performing hotplug operations. So acquiring that lock
141 * guarantees mutual exclusion from any currently running hotplug operations.
142 */
143void cpu_hotplug_disable(void)
144{
145 cpu_maps_update_begin();
146 cpu_hotplug_disabled = 1;
147 cpu_maps_update_done();
148}
149
150void cpu_hotplug_enable(void)
151{
152 cpu_maps_update_begin();
153 cpu_hotplug_disabled = 0;
154 cpu_maps_update_done();
155}
156
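
The only converted caller below is the PM notifier, but the pair is usable by any path that needs the set of online CPUs to stay put for a while. A hypothetical sketch:

int do_multi_cpu_setup(void);		/* hypothetical helper */

/* Hypothetical caller that must see a stable online-CPU set. */
static int example_stable_cpu_section(void)
{
	int ret;

	cpu_hotplug_disable();		/* also waits out a running hotplug op */
	ret = do_multi_cpu_setup();
	cpu_hotplug_enable();		/* re-allow sysfs-driven hotplug */

	return ret;
}
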
136#else /* #if CONFIG_HOTPLUG_CPU */ 157#else /* #if CONFIG_HOTPLUG_CPU */
137static void cpu_hotplug_begin(void) {} 158static void cpu_hotplug_begin(void) {}
138static void cpu_hotplug_done(void) {} 159static void cpu_hotplug_done(void) {}
@@ -541,36 +562,6 @@ static int __init alloc_frozen_cpus(void)
541core_initcall(alloc_frozen_cpus); 562core_initcall(alloc_frozen_cpus);
542 563
543/* 564/*
544 * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
545 * hotplug when tasks are about to be frozen. Also, don't allow the freezer
546 * to continue until any currently running CPU hotplug operation gets
547 * completed.
548 * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
549 * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
550 * CPU hotplug path and released only after it is complete. Thus, we
551 * (and hence the freezer) will block here until any currently running CPU
552 * hotplug operation gets completed.
553 */
554void cpu_hotplug_disable_before_freeze(void)
555{
556 cpu_maps_update_begin();
557 cpu_hotplug_disabled = 1;
558 cpu_maps_update_done();
559}
560
561
562/*
563 * When tasks have been thawed, re-enable regular CPU hotplug (which had been
564 * disabled while beginning to freeze tasks).
565 */
566void cpu_hotplug_enable_after_thaw(void)
567{
568 cpu_maps_update_begin();
569 cpu_hotplug_disabled = 0;
570 cpu_maps_update_done();
571}
572
573/*
574 * When callbacks for CPU hotplug notifications are being executed, we must 565 * When callbacks for CPU hotplug notifications are being executed, we must
575 * ensure that the state of the system with respect to the tasks being frozen 566 * ensure that the state of the system with respect to the tasks being frozen
576 * or not, as reported by the notification, remains unchanged *throughout the 567 * or not, as reported by the notification, remains unchanged *throughout the
@@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
589 580
590 case PM_SUSPEND_PREPARE: 581 case PM_SUSPEND_PREPARE:
591 case PM_HIBERNATION_PREPARE: 582 case PM_HIBERNATION_PREPARE:
592 cpu_hotplug_disable_before_freeze(); 583 cpu_hotplug_disable();
593 break; 584 break;
594 585
595 case PM_POST_SUSPEND: 586 case PM_POST_SUSPEND:
596 case PM_POST_HIBERNATION: 587 case PM_POST_HIBERNATION:
597 cpu_hotplug_enable_after_thaw(); 588 cpu_hotplug_enable();
598 break; 589 break;
599 590
600 default: 591 default:
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index d5585f5e038e..e695c0a0bcb5 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -5,6 +5,7 @@
5#include <linux/cpu.h> 5#include <linux/cpu.h>
6#include <linux/tick.h> 6#include <linux/tick.h>
7#include <linux/mm.h> 7#include <linux/mm.h>
8#include <linux/stackprotector.h>
8 9
9#include <asm/tlb.h> 10#include <asm/tlb.h>
10 11
@@ -58,6 +59,7 @@ void __weak arch_cpu_idle_dead(void) { }
58void __weak arch_cpu_idle(void) 59void __weak arch_cpu_idle(void)
59{ 60{
60 cpu_idle_force_poll = 1; 61 cpu_idle_force_poll = 1;
62 local_irq_enable();
61} 63}
62 64
63/* 65/*
@@ -112,6 +114,21 @@ static void cpu_idle_loop(void)
112 114
113void cpu_startup_entry(enum cpuhp_state state) 115void cpu_startup_entry(enum cpuhp_state state)
114{ 116{
117 /*
118 * This #ifdef needs to die, but it's too late in the cycle to
119 * make this generic (arm and sh have never invoked the canary
120 * init for the non boot cpus!). Will be fixed in 3.11
121 */
122#ifdef CONFIG_X86
123 /*
124 * If we're the non-boot CPU, nothing set the stack canary up
125 * for us. The boot CPU already has it initialized but no harm
126 * in doing it again. This is a good place for updating it, as
127 * we won't ever return from this function (so the invalid
128 * canaries already on the stack won't ever trigger).
129 */
130 boot_init_stack_canary();
131#endif
115 current_set_polling(); 132 current_set_polling();
116 arch_cpu_idle_prepare(); 133 arch_cpu_idle_prepare();
117 cpu_idle_loop(); 134 cpu_idle_loop();
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64b3f791bbe5..e5657788fedd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -59,6 +59,7 @@
59#include <linux/mutex.h> 59#include <linux/mutex.h>
60#include <linux/workqueue.h> 60#include <linux/workqueue.h>
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62#include <linux/wait.h>
62 63
63/* 64/*
64 * Tracks how many cpusets are currently defined in system. 65 * Tracks how many cpusets are currently defined in system.
@@ -87,6 +88,18 @@ struct cpuset {
87 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 88 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 89 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
89 90
91 /*
 92 * This is the old set of Memory Nodes the tasks took on.
93 *
94 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
95 * - A new cpuset's old_mems_allowed is initialized when some
96 * task is moved into it.
97 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
98 * cpuset.mems_allowed and have tasks' nodemask updated, and
99 * then old_mems_allowed is updated to mems_allowed.
100 */
101 nodemask_t old_mems_allowed;
102
90 struct fmeter fmeter; /* memory_pressure filter */ 103 struct fmeter fmeter; /* memory_pressure filter */
91 104
92 /* 105 /*
@@ -100,14 +113,12 @@ struct cpuset {
100 113
101 /* for custom sched domain */ 114 /* for custom sched domain */
102 int relax_domain_level; 115 int relax_domain_level;
103
104 struct work_struct hotplug_work;
105}; 116};
106 117
107/* Retrieve the cpuset for a cgroup */ 118/* Retrieve the cpuset for a cgroup */
108static inline struct cpuset *cgroup_cs(struct cgroup *cont) 119static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
109{ 120{
110 return container_of(cgroup_subsys_state(cont, cpuset_subsys_id), 121 return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id),
111 struct cpuset, css); 122 struct cpuset, css);
112} 123}
113 124
@@ -267,14 +278,11 @@ static DEFINE_MUTEX(callback_mutex);
267/* 278/*
268 * CPU / memory hotplug is handled asynchronously. 279 * CPU / memory hotplug is handled asynchronously.
269 */ 280 */
270static struct workqueue_struct *cpuset_propagate_hotplug_wq;
271
272static void cpuset_hotplug_workfn(struct work_struct *work); 281static void cpuset_hotplug_workfn(struct work_struct *work);
273static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
274static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
275
276static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); 282static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
277 283
284static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
285
278/* 286/*
279 * This is ugly, but preserves the userspace API for existing cpuset 287 * This is ugly, but preserves the userspace API for existing cpuset
280 * users. If someone tries to mount the "cpuset" filesystem, we 288 * users. If someone tries to mount the "cpuset" filesystem, we
@@ -304,53 +312,38 @@ static struct file_system_type cpuset_fs_type = {
304/* 312/*
305 * Return in pmask the portion of a cpusets's cpus_allowed that 313 * Return in pmask the portion of a cpusets's cpus_allowed that
306 * are online. If none are online, walk up the cpuset hierarchy 314 * are online. If none are online, walk up the cpuset hierarchy
307 * until we find one that does have some online cpus. If we get 315 * until we find one that does have some online cpus. The top
308 * all the way to the top and still haven't found any online cpus, 316 * cpuset always has some cpus online.
309 * return cpu_online_mask. Or if passed a NULL cs from an exit'ing
310 * task, return cpu_online_mask.
311 * 317 *
312 * One way or another, we guarantee to return some non-empty subset 318 * One way or another, we guarantee to return some non-empty subset
313 * of cpu_online_mask. 319 * of cpu_online_mask.
314 * 320 *
315 * Call with callback_mutex held. 321 * Call with callback_mutex held.
316 */ 322 */
317
318static void guarantee_online_cpus(const struct cpuset *cs, 323static void guarantee_online_cpus(const struct cpuset *cs,
319 struct cpumask *pmask) 324 struct cpumask *pmask)
320{ 325{
321 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 326 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
322 cs = parent_cs(cs); 327 cs = parent_cs(cs);
323 if (cs) 328 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
324 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
325 else
326 cpumask_copy(pmask, cpu_online_mask);
327 BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
328} 329}
329 330
330/* 331/*
331 * Return in *pmask the portion of a cpusets's mems_allowed that 332 * Return in *pmask the portion of a cpusets's mems_allowed that
332 * are online, with memory. If none are online with memory, walk 333 * are online, with memory. If none are online with memory, walk
333 * up the cpuset hierarchy until we find one that does have some 334 * up the cpuset hierarchy until we find one that does have some
334 * online mems. If we get all the way to the top and still haven't 335 * online mems. The top cpuset always has some mems online.
335 * found any online mems, return node_states[N_MEMORY].
336 * 336 *
337 * One way or another, we guarantee to return some non-empty subset 337 * One way or another, we guarantee to return some non-empty subset
338 * of node_states[N_MEMORY]. 338 * of node_states[N_MEMORY].
339 * 339 *
340 * Call with callback_mutex held. 340 * Call with callback_mutex held.
341 */ 341 */
342
343static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 342static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
344{ 343{
345 while (cs && !nodes_intersects(cs->mems_allowed, 344 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
346 node_states[N_MEMORY]))
347 cs = parent_cs(cs); 345 cs = parent_cs(cs);
348 if (cs) 346 nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
349 nodes_and(*pmask, cs->mems_allowed,
350 node_states[N_MEMORY]);
351 else
352 *pmask = node_states[N_MEMORY];
353 BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
354} 347}
355 348
356/* 349/*
@@ -440,7 +433,7 @@ static void free_trial_cpuset(struct cpuset *trial)
440 433
441static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 434static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
442{ 435{
443 struct cgroup *cont; 436 struct cgroup *cgrp;
444 struct cpuset *c, *par; 437 struct cpuset *c, *par;
445 int ret; 438 int ret;
446 439
@@ -448,7 +441,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
448 441
449 /* Each of our child cpusets must be a subset of us */ 442 /* Each of our child cpusets must be a subset of us */
450 ret = -EBUSY; 443 ret = -EBUSY;
451 cpuset_for_each_child(c, cont, cur) 444 cpuset_for_each_child(c, cgrp, cur)
452 if (!is_cpuset_subset(c, trial)) 445 if (!is_cpuset_subset(c, trial))
453 goto out; 446 goto out;
454 447
@@ -469,7 +462,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
469 * overlap 462 * overlap
470 */ 463 */
471 ret = -EINVAL; 464 ret = -EINVAL;
472 cpuset_for_each_child(c, cont, par) { 465 cpuset_for_each_child(c, cgrp, par) {
473 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 466 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
474 c != cur && 467 c != cur &&
475 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 468 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -486,7 +479,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
486 */ 479 */
487 ret = -ENOSPC; 480 ret = -ENOSPC;
488 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && 481 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
489 (cpumask_empty(trial->cpus_allowed) || 482 (cpumask_empty(trial->cpus_allowed) &&
490 nodes_empty(trial->mems_allowed))) 483 nodes_empty(trial->mems_allowed)))
491 goto out; 484 goto out;
492 485
@@ -540,7 +533,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
540 * This function builds a partial partition of the systems CPUs 533 * This function builds a partial partition of the systems CPUs
541 * A 'partial partition' is a set of non-overlapping subsets whose 534 * A 'partial partition' is a set of non-overlapping subsets whose
542 * union is a subset of that set. 535 * union is a subset of that set.
543 * The output of this function needs to be passed to kernel/sched.c 536 * The output of this function needs to be passed to kernel/sched/core.c
544 * partition_sched_domains() routine, which will rebuild the scheduler's 537 * partition_sched_domains() routine, which will rebuild the scheduler's
545 * load balancing domains (sched domains) as specified by that partial 538 * load balancing domains (sched domains) as specified by that partial
546 * partition. 539 * partition.
@@ -569,7 +562,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
569 * is a subset of one of these domains, while there are as 562 * is a subset of one of these domains, while there are as
570 * many such domains as possible, each as small as possible. 563 * many such domains as possible, each as small as possible.
571 * doms - Conversion of 'csa' to an array of cpumasks, for passing to 564 * doms - Conversion of 'csa' to an array of cpumasks, for passing to
572 * the kernel/sched.c routine partition_sched_domains() in a 565 * the kernel/sched/core.c routine partition_sched_domains() in a
573 * convenient format, that can be easily compared to the prior 566 * convenient format, that can be easily compared to the prior
574 * value to determine what partition elements (sched domains) 567 * value to determine what partition elements (sched domains)
575 * were changed (added or removed.) 568 * were changed (added or removed.)
@@ -798,21 +791,43 @@ void rebuild_sched_domains(void)
798 mutex_unlock(&cpuset_mutex); 791 mutex_unlock(&cpuset_mutex);
799} 792}
800 793
801/** 794/*
802 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's 795 * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
803 * @tsk: task to test 796 * @cs: the cpuset in interest
804 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
805 * 797 *
806 * Call with cpuset_mutex held. May take callback_mutex during call. 798 * A cpuset's effective cpumask is the cpumask of the nearest ancestor
807 * Called for each task in a cgroup by cgroup_scan_tasks(). 799 * with non-empty cpus. We use effective cpumask whenever:
808 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other 800 * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
809 * words, if its mask is not equal to its cpuset's mask). 801 * if the cpuset they reside in has no cpus)
802 * - we want to retrieve task_cs(tsk)'s cpus_allowed.
803 *
804 * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
805 * exception. See comments there.
810 */ 806 */
811static int cpuset_test_cpumask(struct task_struct *tsk, 807static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
812 struct cgroup_scanner *scan)
813{ 808{
814 return !cpumask_equal(&tsk->cpus_allowed, 809 while (cpumask_empty(cs->cpus_allowed))
815 (cgroup_cs(scan->cg))->cpus_allowed); 810 cs = parent_cs(cs);
811 return cs;
812}
813
814/*
815 * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
816 * @cs: the cpuset in interest
817 *
818 * A cpuset's effective nodemask is the nodemask of the nearest ancestor
 819 * with non-empty mems. We use effective nodemask whenever:
820 * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
821 * if the cpuset they reside in has no mems)
822 * - we want to retrieve task_cs(tsk)'s mems_allowed.
823 *
824 * Called with cpuset_mutex held.
825 */
826static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
827{
828 while (nodes_empty(cs->mems_allowed))
829 cs = parent_cs(cs);
830 return cs;
816} 831}
817 832
818/** 833/**
@@ -829,7 +844,10 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
829static void cpuset_change_cpumask(struct task_struct *tsk, 844static void cpuset_change_cpumask(struct task_struct *tsk,
830 struct cgroup_scanner *scan) 845 struct cgroup_scanner *scan)
831{ 846{
832 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); 847 struct cpuset *cpus_cs;
848
849 cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
850 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
833} 851}
834 852
835/** 853/**
@@ -850,12 +868,51 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
850 struct cgroup_scanner scan; 868 struct cgroup_scanner scan;
851 869
852 scan.cg = cs->css.cgroup; 870 scan.cg = cs->css.cgroup;
853 scan.test_task = cpuset_test_cpumask; 871 scan.test_task = NULL;
854 scan.process_task = cpuset_change_cpumask; 872 scan.process_task = cpuset_change_cpumask;
855 scan.heap = heap; 873 scan.heap = heap;
856 cgroup_scan_tasks(&scan); 874 cgroup_scan_tasks(&scan);
857} 875}
858 876
877/*
878 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
879 * @root_cs: the root cpuset of the hierarchy
880 * @update_root: update root cpuset or not?
881 * @heap: the heap used by cgroup_scan_tasks()
882 *
883 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
884 * which take on cpumask of @root_cs.
885 *
886 * Called with cpuset_mutex held
887 */
888static void update_tasks_cpumask_hier(struct cpuset *root_cs,
889 bool update_root, struct ptr_heap *heap)
890{
891 struct cpuset *cp;
892 struct cgroup *pos_cgrp;
893
894 if (update_root)
895 update_tasks_cpumask(root_cs, heap);
896
897 rcu_read_lock();
898 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
 899 /* skip the whole subtree if @cp has some CPUs */
900 if (!cpumask_empty(cp->cpus_allowed)) {
901 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
902 continue;
903 }
904 if (!css_tryget(&cp->css))
905 continue;
906 rcu_read_unlock();
907
908 update_tasks_cpumask(cp, heap);
909
910 rcu_read_lock();
911 css_put(&cp->css);
912 }
913 rcu_read_unlock();
914}
915
859/** 916/**
860 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 917 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
861 * @cs: the cpuset to consider 918 * @cs: the cpuset to consider
@@ -888,14 +945,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
888 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) 945 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
889 return -EINVAL; 946 return -EINVAL;
890 } 947 }
891 retval = validate_change(cs, trialcs);
892 if (retval < 0)
893 return retval;
894 948
895 /* Nothing to do if the cpus didn't change */ 949 /* Nothing to do if the cpus didn't change */
896 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) 950 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
897 return 0; 951 return 0;
898 952
953 retval = validate_change(cs, trialcs);
954 if (retval < 0)
955 return retval;
956
899 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); 957 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
900 if (retval) 958 if (retval)
901 return retval; 959 return retval;
@@ -906,11 +964,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
906 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 964 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
907 mutex_unlock(&callback_mutex); 965 mutex_unlock(&callback_mutex);
908 966
909 /* 967 update_tasks_cpumask_hier(cs, true, &heap);
910 * Scan tasks in the cpuset, and update the cpumasks of any
911 * that need an update.
912 */
913 update_tasks_cpumask(cs, &heap);
914 968
915 heap_free(&heap); 969 heap_free(&heap);
916 970
@@ -943,12 +997,14 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
943 const nodemask_t *to) 997 const nodemask_t *to)
944{ 998{
945 struct task_struct *tsk = current; 999 struct task_struct *tsk = current;
1000 struct cpuset *mems_cs;
946 1001
947 tsk->mems_allowed = *to; 1002 tsk->mems_allowed = *to;
948 1003
949 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 1004 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
950 1005
951 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); 1006 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
1007 guarantee_online_mems(mems_cs, &tsk->mems_allowed);
952} 1008}
953 1009
954/* 1010/*
@@ -1007,16 +1063,12 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1007static void cpuset_change_nodemask(struct task_struct *p, 1063static void cpuset_change_nodemask(struct task_struct *p,
1008 struct cgroup_scanner *scan) 1064 struct cgroup_scanner *scan)
1009{ 1065{
1066 struct cpuset *cs = cgroup_cs(scan->cg);
1010 struct mm_struct *mm; 1067 struct mm_struct *mm;
1011 struct cpuset *cs;
1012 int migrate; 1068 int migrate;
1013 const nodemask_t *oldmem = scan->data; 1069 nodemask_t *newmems = scan->data;
1014 static nodemask_t newmems; /* protected by cpuset_mutex */
1015
1016 cs = cgroup_cs(scan->cg);
1017 guarantee_online_mems(cs, &newmems);
1018 1070
1019 cpuset_change_task_nodemask(p, &newmems); 1071 cpuset_change_task_nodemask(p, newmems);
1020 1072
1021 mm = get_task_mm(p); 1073 mm = get_task_mm(p);
1022 if (!mm) 1074 if (!mm)
@@ -1026,7 +1078,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1026 1078
1027 mpol_rebind_mm(mm, &cs->mems_allowed); 1079 mpol_rebind_mm(mm, &cs->mems_allowed);
1028 if (migrate) 1080 if (migrate)
1029 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); 1081 cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems);
1030 mmput(mm); 1082 mmput(mm);
1031} 1083}
1032 1084
@@ -1035,25 +1087,27 @@ static void *cpuset_being_rebound;
1035/** 1087/**
1036 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1088 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1037 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1089 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1038 * @oldmem: old mems_allowed of cpuset cs
1039 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1090 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1040 * 1091 *
1041 * Called with cpuset_mutex held 1092 * Called with cpuset_mutex held
1042 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1093 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1043 * if @heap != NULL. 1094 * if @heap != NULL.
1044 */ 1095 */
1045static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, 1096static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1046 struct ptr_heap *heap)
1047{ 1097{
1098 static nodemask_t newmems; /* protected by cpuset_mutex */
1048 struct cgroup_scanner scan; 1099 struct cgroup_scanner scan;
1100 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1049 1101
1050 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1102 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1051 1103
1104 guarantee_online_mems(mems_cs, &newmems);
1105
1052 scan.cg = cs->css.cgroup; 1106 scan.cg = cs->css.cgroup;
1053 scan.test_task = NULL; 1107 scan.test_task = NULL;
1054 scan.process_task = cpuset_change_nodemask; 1108 scan.process_task = cpuset_change_nodemask;
1055 scan.heap = heap; 1109 scan.heap = heap;
1056 scan.data = (nodemask_t *)oldmem; 1110 scan.data = &newmems;
1057 1111
1058 /* 1112 /*
1059 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1113 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1067,11 +1121,56 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1067 */ 1121 */
1068 cgroup_scan_tasks(&scan); 1122 cgroup_scan_tasks(&scan);
1069 1123
1124 /*
1125 * All the tasks' nodemasks have been updated, update
1126 * cs->old_mems_allowed.
1127 */
1128 cs->old_mems_allowed = newmems;
1129
1070 /* We're done rebinding vmas to this cpuset's new mems_allowed. */ 1130 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1071 cpuset_being_rebound = NULL; 1131 cpuset_being_rebound = NULL;
1072} 1132}
1073 1133
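
update_tasks_nodemask() above is also where old_mems_allowed (added to struct cpuset earlier in this patch) earns its keep. Condensed to a single task — the cgroup_scan_tasks() walk and the memory_migrate check are elided; all names are from this patch — the ordering is:

/* Sketch of the old_mems_allowed lifecycle (single task, checks elided). */
static void example_mems_update(struct cpuset *cs, struct task_struct *p,
				struct mm_struct *mm)
{
	static nodemask_t newmems;	/* cpuset_mutex serializes users */

	/* 1. effective target: nearest ancestor with online mems */
	guarantee_online_mems(effective_nodemask_cpuset(cs), &newmems);

	/* 2. rebind the task, then migrate its pages from the placement it
	 *    actually had (old_mems_allowed), not from cs->mems_allowed */
	cpuset_change_task_nodemask(p, &newmems);
	cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);

	/* 3. only now does the new placement become the "old" one */
	cs->old_mems_allowed = newmems;
}
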
1074/* 1134/*
1135 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1136 * @root_cs: the root cpuset of the hierarchy
1137 * @update_root: update the root cpuset or not?
1138 * @heap: the heap used by cgroup_scan_tasks()
1139 *
1140 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1141 * which take on nodemask of @root_cs.
1142 *
1143 * Called with cpuset_mutex held
1144 */
1145static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1146 bool update_root, struct ptr_heap *heap)
1147{
1148 struct cpuset *cp;
1149 struct cgroup *pos_cgrp;
1150
1151 if (update_root)
1152 update_tasks_nodemask(root_cs, heap);
1153
1154 rcu_read_lock();
1155 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
 1157 /* skip the whole subtree if @cp has some mems */
1157 if (!nodes_empty(cp->mems_allowed)) {
1158 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
1159 continue;
1160 }
1161 if (!css_tryget(&cp->css))
1162 continue;
1163 rcu_read_unlock();
1164
1165 update_tasks_nodemask(cp, heap);
1166
1167 rcu_read_lock();
1168 css_put(&cp->css);
1169 }
1170 rcu_read_unlock();
1171}
1172
1173/*
1075 * Handle user request to change the 'mems' memory placement 1174 * Handle user request to change the 'mems' memory placement
1076 * of a cpuset. Needs to validate the request, update the 1175 * of a cpuset. Needs to validate the request, update the
1077 * cpusets mems_allowed, and for each task in the cpuset, 1176 * cpusets mems_allowed, and for each task in the cpuset,
@@ -1087,13 +1186,9 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1087static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 1186static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1088 const char *buf) 1187 const char *buf)
1089{ 1188{
1090 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1091 int retval; 1189 int retval;
1092 struct ptr_heap heap; 1190 struct ptr_heap heap;
1093 1191
1094 if (!oldmem)
1095 return -ENOMEM;
1096
1097 /* 1192 /*
1098 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 1193 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
1099 * it's read-only 1194 * it's read-only
@@ -1122,8 +1217,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1122 goto done; 1217 goto done;
1123 } 1218 }
1124 } 1219 }
1125 *oldmem = cs->mems_allowed; 1220
1126 if (nodes_equal(*oldmem, trialcs->mems_allowed)) { 1221 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1127 retval = 0; /* Too easy - nothing to do */ 1222 retval = 0; /* Too easy - nothing to do */
1128 goto done; 1223 goto done;
1129 } 1224 }
@@ -1139,11 +1234,10 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1139 cs->mems_allowed = trialcs->mems_allowed; 1234 cs->mems_allowed = trialcs->mems_allowed;
1140 mutex_unlock(&callback_mutex); 1235 mutex_unlock(&callback_mutex);
1141 1236
1142 update_tasks_nodemask(cs, oldmem, &heap); 1237 update_tasks_nodemask_hier(cs, true, &heap);
1143 1238
1144 heap_free(&heap); 1239 heap_free(&heap);
1145done: 1240done:
1146 NODEMASK_FREE(oldmem);
1147 return retval; 1241 return retval;
1148} 1242}
1149 1243
@@ -1372,8 +1466,13 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1372 1466
1373 mutex_lock(&cpuset_mutex); 1467 mutex_lock(&cpuset_mutex);
1374 1468
1469 /*
1470 * We allow to move tasks into an empty cpuset if sane_behavior
1471 * flag is set.
1472 */
1375 ret = -ENOSPC; 1473 ret = -ENOSPC;
1376 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1474 if (!cgroup_sane_behavior(cgrp) &&
1475 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1377 goto out_unlock; 1476 goto out_unlock;
1378 1477
1379 cgroup_taskset_for_each(task, cgrp, tset) { 1478 cgroup_taskset_for_each(task, cgrp, tset) {
@@ -1422,8 +1521,7 @@ static cpumask_var_t cpus_attach;
1422 1521
1423static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1522static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1424{ 1523{
1425 /* static bufs protected by cpuset_mutex */ 1524 /* static buf protected by cpuset_mutex */
1426 static nodemask_t cpuset_attach_nodemask_from;
1427 static nodemask_t cpuset_attach_nodemask_to; 1525 static nodemask_t cpuset_attach_nodemask_to;
1428 struct mm_struct *mm; 1526 struct mm_struct *mm;
1429 struct task_struct *task; 1527 struct task_struct *task;
@@ -1431,6 +1529,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1431 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); 1529 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
1432 struct cpuset *cs = cgroup_cs(cgrp); 1530 struct cpuset *cs = cgroup_cs(cgrp);
1433 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1531 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1532 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1533 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1434 1534
1435 mutex_lock(&cpuset_mutex); 1535 mutex_lock(&cpuset_mutex);
1436 1536
@@ -1438,9 +1538,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1438 if (cs == &top_cpuset) 1538 if (cs == &top_cpuset)
1439 cpumask_copy(cpus_attach, cpu_possible_mask); 1539 cpumask_copy(cpus_attach, cpu_possible_mask);
1440 else 1540 else
1441 guarantee_online_cpus(cs, cpus_attach); 1541 guarantee_online_cpus(cpus_cs, cpus_attach);
1442 1542
1443 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1543 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1444 1544
1445 cgroup_taskset_for_each(task, cgrp, tset) { 1545 cgroup_taskset_for_each(task, cgrp, tset) {
1446 /* 1546 /*
@@ -1457,26 +1557,32 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1457 * Change mm, possibly for multiple threads in a threadgroup. This is 1557 * Change mm, possibly for multiple threads in a threadgroup. This is
1458 * expensive and may sleep. 1558 * expensive and may sleep.
1459 */ 1559 */
1460 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1461 cpuset_attach_nodemask_to = cs->mems_allowed; 1560 cpuset_attach_nodemask_to = cs->mems_allowed;
1462 mm = get_task_mm(leader); 1561 mm = get_task_mm(leader);
1463 if (mm) { 1562 if (mm) {
1563 struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
1564
1464 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1565 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1465 if (is_memory_migrate(cs)) 1566
1466 cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, 1567 /*
 1568 * old_mems_allowed is the same as mems_allowed here, except
1569 * if this task is being moved automatically due to hotplug.
1570 * In that case @mems_allowed has been updated and is empty,
 1571 * so @old_mems_allowed is the right nodemask that we migrate
1572 * mm from.
1573 */
1574 if (is_memory_migrate(cs)) {
1575 cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
1467 &cpuset_attach_nodemask_to); 1576 &cpuset_attach_nodemask_to);
1577 }
1468 mmput(mm); 1578 mmput(mm);
1469 } 1579 }
1470 1580
1471 cs->attach_in_progress--; 1581 cs->old_mems_allowed = cpuset_attach_nodemask_to;
1472 1582
1473 /* 1583 cs->attach_in_progress--;
1474 * We may have raced with CPU/memory hotunplug. Trigger hotplug 1584 if (!cs->attach_in_progress)
1475 * propagation if @cs doesn't have any CPU or memory. It will move 1585 wake_up(&cpuset_attach_wq);
1476 * the newly added tasks to the nearest parent which can execute.
1477 */
1478 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1479 schedule_cpuset_propagate_hotplug(cs);
1480 1586
1481 mutex_unlock(&cpuset_mutex); 1587 mutex_unlock(&cpuset_mutex);
1482} 1588}
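
attach_in_progress now pairs with cpuset_attach_wq as a simple completion counter: the last in-flight attach wakes any waiter. The waiting side is not visible in these hunks; schematically the two halves are the usual wait_event() pairing:

/* Counter + waitqueue pairing; the waiter side is a sketch, not a hunk above. */
static void example_attach_finished(struct cpuset *cs)
{
	/* called with cpuset_mutex held, as in cpuset_attach() above */
	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);
}

static void example_wait_for_attaches(struct cpuset *cs)
{
	/* sleeps until every in-flight attach has dropped its count */
	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
}
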
@@ -1588,13 +1694,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1588 * resources, wait for the previously scheduled operations before 1694 * resources, wait for the previously scheduled operations before
1589 * proceeding, so that we don't keep removing tasks added 1695
1590 * after execution capability is restored. 1696 * after execution capability is restored.
1591 *
1592 * Flushing cpuset_hotplug_work is enough to synchronize against
1593 * hotplug hanlding; however, cpuset_attach() may schedule
1594 * propagation work directly. Flush the workqueue too.
1595 */ 1697 */
1596 flush_work(&cpuset_hotplug_work); 1698 flush_work(&cpuset_hotplug_work);
1597 flush_workqueue(cpuset_propagate_hotplug_wq);
1598 1699
1599 mutex_lock(&cpuset_mutex); 1700 mutex_lock(&cpuset_mutex);
1600 if (!is_cpuset_online(cs)) 1701 if (!is_cpuset_online(cs))
@@ -1658,13 +1759,13 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1658 return count; 1759 return count;
1659} 1760}
1660 1761
1661static ssize_t cpuset_common_file_read(struct cgroup *cont, 1762static ssize_t cpuset_common_file_read(struct cgroup *cgrp,
1662 struct cftype *cft, 1763 struct cftype *cft,
1663 struct file *file, 1764 struct file *file,
1664 char __user *buf, 1765 char __user *buf,
1665 size_t nbytes, loff_t *ppos) 1766 size_t nbytes, loff_t *ppos)
1666{ 1767{
1667 struct cpuset *cs = cgroup_cs(cont); 1768 struct cpuset *cs = cgroup_cs(cgrp);
1668 cpuset_filetype_t type = cft->private; 1769 cpuset_filetype_t type = cft->private;
1669 char *page; 1770 char *page;
1670 ssize_t retval = 0; 1771 ssize_t retval = 0;
@@ -1694,9 +1795,9 @@ out:
1694 return retval; 1795 return retval;
1695} 1796}
1696 1797
1697static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) 1798static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)
1698{ 1799{
1699 struct cpuset *cs = cgroup_cs(cont); 1800 struct cpuset *cs = cgroup_cs(cgrp);
1700 cpuset_filetype_t type = cft->private; 1801 cpuset_filetype_t type = cft->private;
1701 switch (type) { 1802 switch (type) {
1702 case FILE_CPU_EXCLUSIVE: 1803 case FILE_CPU_EXCLUSIVE:
@@ -1725,9 +1826,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1725 return 0; 1826 return 0;
1726} 1827}
1727 1828
1728static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) 1829static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft)
1729{ 1830{
1730 struct cpuset *cs = cgroup_cs(cont); 1831 struct cpuset *cs = cgroup_cs(cgrp);
1731 cpuset_filetype_t type = cft->private; 1832 cpuset_filetype_t type = cft->private;
1732 switch (type) { 1833 switch (type) {
1733 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1834 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1839,14 +1940,14 @@ static struct cftype files[] = {
1839 1940
1840/* 1941/*
1841 * cpuset_css_alloc - allocate a cpuset css 1942 * cpuset_css_alloc - allocate a cpuset css
1842 * cont: control group that the new cpuset will be part of 1943 * cgrp: control group that the new cpuset will be part of
1843 */ 1944 */
1844 1945
1845static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) 1946static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)
1846{ 1947{
1847 struct cpuset *cs; 1948 struct cpuset *cs;
1848 1949
1849 if (!cont->parent) 1950 if (!cgrp->parent)
1850 return &top_cpuset.css; 1951 return &top_cpuset.css;
1851 1952
1852 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1953 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
@@ -1861,7 +1962,6 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1861 cpumask_clear(cs->cpus_allowed); 1962 cpumask_clear(cs->cpus_allowed);
1862 nodes_clear(cs->mems_allowed); 1963 nodes_clear(cs->mems_allowed);
1863 fmeter_init(&cs->fmeter); 1964 fmeter_init(&cs->fmeter);
1864 INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
1865 cs->relax_domain_level = -1; 1965 cs->relax_domain_level = -1;
1866 1966
1867 return &cs->css; 1967 return &cs->css;
@@ -1942,9 +2042,9 @@ static void cpuset_css_offline(struct cgroup *cgrp)
1942 * will call rebuild_sched_domains_locked(). 2042 * will call rebuild_sched_domains_locked().
1943 */ 2043 */
1944 2044
1945static void cpuset_css_free(struct cgroup *cont) 2045static void cpuset_css_free(struct cgroup *cgrp)
1946{ 2046{
1947 struct cpuset *cs = cgroup_cs(cont); 2047 struct cpuset *cs = cgroup_cs(cgrp);
1948 2048
1949 free_cpumask_var(cs->cpus_allowed); 2049 free_cpumask_var(cs->cpus_allowed);
1950 kfree(cs); 2050 kfree(cs);
@@ -2024,41 +2124,64 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2024} 2124}
2025 2125
2026/** 2126/**
2027 * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset 2127 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
2028 * @cs: cpuset in interest 2128 * @cs: cpuset in interest
2029 * 2129 *
2030 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone 2130 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
2031 * offline, update @cs accordingly. If @cs ends up with no CPU or memory, 2131 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
2032 * all its tasks are moved to the nearest ancestor with both resources. 2132 * all its tasks are moved to the nearest ancestor with both resources.
2033 */ 2133 */
2034static void cpuset_propagate_hotplug_workfn(struct work_struct *work) 2134static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2035{ 2135{
2036 static cpumask_t off_cpus; 2136 static cpumask_t off_cpus;
2037 static nodemask_t off_mems, tmp_mems; 2137 static nodemask_t off_mems;
2038 struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
2039 bool is_empty; 2138 bool is_empty;
2139 bool sane = cgroup_sane_behavior(cs->css.cgroup);
2140
2141retry:
2142 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2040 2143
2041 mutex_lock(&cpuset_mutex); 2144 mutex_lock(&cpuset_mutex);
2042 2145
2146 /*
2147 * We have raced with task attaching. We wait until attaching
2148 * is finished, so we won't attach a task to an empty cpuset.
2149 */
2150 if (cs->attach_in_progress) {
2151 mutex_unlock(&cpuset_mutex);
2152 goto retry;
2153 }
2154
2043 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); 2155 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
2044 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); 2156 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
2045 2157
2046 /* remove offline cpus from @cs */ 2158 mutex_lock(&callback_mutex);
2047 if (!cpumask_empty(&off_cpus)) { 2159 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2048 mutex_lock(&callback_mutex); 2160 mutex_unlock(&callback_mutex);
2049 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); 2161
2050 mutex_unlock(&callback_mutex); 2162 /*
2163 * If sane_behavior flag is set, we need to update tasks' cpumask
2164 * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
2165 * call update_tasks_cpumask() if the cpuset becomes empty, as
2166 * the tasks in it will be migrated to an ancestor.
2167 */
2168 if ((sane && cpumask_empty(cs->cpus_allowed)) ||
2169 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
2051 update_tasks_cpumask(cs, NULL); 2170 update_tasks_cpumask(cs, NULL);
2052 }
2053 2171
2054 /* remove offline mems from @cs */ 2172 mutex_lock(&callback_mutex);
2055 if (!nodes_empty(off_mems)) { 2173 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
2056 tmp_mems = cs->mems_allowed; 2174 mutex_unlock(&callback_mutex);
2057 mutex_lock(&callback_mutex); 2175
2058 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); 2176 /*
2059 mutex_unlock(&callback_mutex); 2177 * If sane_behavior flag is set, we need to update tasks' nodemask
2060 update_tasks_nodemask(cs, &tmp_mems, NULL); 2178 * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
2061 } 2179 * call update_tasks_nodemask() if the cpuset becomes empty, as
2180 * the tasks in it will be migrated to an ancestor.
2181 */
2182 if ((sane && nodes_empty(cs->mems_allowed)) ||
2183 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
2184 update_tasks_nodemask(cs, NULL);
2062 2185
2063 is_empty = cpumask_empty(cs->cpus_allowed) || 2186 is_empty = cpumask_empty(cs->cpus_allowed) ||
2064 nodes_empty(cs->mems_allowed); 2187 nodes_empty(cs->mems_allowed);
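
A rough userspace analogue of the retry above, for illustration only: it uses POSIX threads rather than the kernel's cpuset_attach_wq/wait_event() machinery, and the names in_progress, attach_begin(), attach_end() and hotplug_update() are invented for the sketch. Because pthread_cond_wait() re-acquires the mutex atomically, the explicit goto-retry collapses into a plain while loop; the kernel needs the retry because its waitqueue sleep happens outside cpuset_mutex.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;
static int in_progress;                 /* attaches currently in flight */

static void attach_begin(void)
{
    pthread_mutex_lock(&lock);
    in_progress++;
    pthread_mutex_unlock(&lock);
}

static void attach_end(void)
{
    pthread_mutex_lock(&lock);
    if (--in_progress == 0)
        pthread_cond_broadcast(&wq);    /* like wake_up(&cpuset_attach_wq) */
    pthread_mutex_unlock(&lock);
}

static void hotplug_update(void)
{
    pthread_mutex_lock(&lock);
    while (in_progress)                 /* re-check under the lock, then sleep */
        pthread_cond_wait(&wq, &lock);
    /* safe to shrink the masks here: no attach can race with us */
    printf("updating with no attach in flight\n");
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    attach_begin();
    attach_end();
    hotplug_update();
    return 0;
}

Build with cc -pthread; the single-threaded main() only exercises the bookkeeping, the interesting part is the wait-then-recheck shape.
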
@@ -2066,40 +2189,14 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
2066 mutex_unlock(&cpuset_mutex); 2189 mutex_unlock(&cpuset_mutex);
2067 2190
2068 /* 2191 /*
2069 * If @cs became empty, move tasks to the nearest ancestor with 2192 * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
2070 * execution resources. This is full cgroup operation which will 2193 *
2194 * Otherwise move tasks to the nearest ancestor with execution
2195 * resources. This is a full cgroup operation which will
2071 * also call back into cpuset. Should be done outside any lock. 2196 * also call back into cpuset. Should be done outside any lock.
2072 */ 2197 */
2073 if (is_empty) 2198 if (!sane && is_empty)
2074 remove_tasks_in_empty_cpuset(cs); 2199 remove_tasks_in_empty_cpuset(cs);
2075
2076 /* the following may free @cs, should be the last operation */
2077 css_put(&cs->css);
2078}
2079
2080/**
2081 * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
2082 * @cs: cpuset of interest
2083 *
2084 * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
2085 * memory masks according to top_cpuset.
2086 */
2087static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2088{
2089 /*
2090 * Pin @cs. The refcnt will be released when the work item
2091 * finishes executing.
2092 */
2093 if (!css_tryget(&cs->css))
2094 return;
2095
2096 /*
2097 * Queue @cs->hotplug_work. If already pending, lose the css ref.
2098 * cpuset_propagate_hotplug_wq is ordered and propagation will
2099 * happen in the order this function is called.
2100 */
2101 if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
2102 css_put(&cs->css);
2103} 2200}
2104 2201
2105/** 2202/**
@@ -2112,18 +2209,17 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2112 * actively using CPU hotplug but making no active use of cpusets. 2209 * actively using CPU hotplug but making no active use of cpusets.
2113 * 2210 *
2114 * Non-root cpusets are only affected by offlining. If any CPUs or memory 2211 * Non-root cpusets are only affected by offlining. If any CPUs or memory
2115 * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all 2212 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
2116 * descendants. 2213 * all descendants.
2117 * 2214 *
2118 * Note that CPU offlining during suspend is ignored. We don't modify 2215 * Note that CPU offlining during suspend is ignored. We don't modify
2119 * cpusets across suspend/resume cycles at all. 2216 * cpusets across suspend/resume cycles at all.
2120 */ 2217 */
2121static void cpuset_hotplug_workfn(struct work_struct *work) 2218static void cpuset_hotplug_workfn(struct work_struct *work)
2122{ 2219{
2123 static cpumask_t new_cpus, tmp_cpus; 2220 static cpumask_t new_cpus;
2124 static nodemask_t new_mems, tmp_mems; 2221 static nodemask_t new_mems;
2125 bool cpus_updated, mems_updated; 2222 bool cpus_updated, mems_updated;
2126 bool cpus_offlined, mems_offlined;
2127 2223
2128 mutex_lock(&cpuset_mutex); 2224 mutex_lock(&cpuset_mutex);
2129 2225
@@ -2132,12 +2228,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2132 new_mems = node_states[N_MEMORY]; 2228 new_mems = node_states[N_MEMORY];
2133 2229
2134 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); 2230 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
2135 cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
2136 &new_cpus);
2137
2138 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); 2231 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
2139 nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
2140 mems_offlined = !nodes_empty(tmp_mems);
2141 2232
2142 /* synchronize cpus_allowed to cpu_active_mask */ 2233 /* synchronize cpus_allowed to cpu_active_mask */
2143 if (cpus_updated) { 2234 if (cpus_updated) {
@@ -2149,28 +2240,32 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2149 2240
2150 /* synchronize mems_allowed to N_MEMORY */ 2241 /* synchronize mems_allowed to N_MEMORY */
2151 if (mems_updated) { 2242 if (mems_updated) {
2152 tmp_mems = top_cpuset.mems_allowed;
2153 mutex_lock(&callback_mutex); 2243 mutex_lock(&callback_mutex);
2154 top_cpuset.mems_allowed = new_mems; 2244 top_cpuset.mems_allowed = new_mems;
2155 mutex_unlock(&callback_mutex); 2245 mutex_unlock(&callback_mutex);
2156 update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); 2246 update_tasks_nodemask(&top_cpuset, NULL);
2157 } 2247 }
2158 2248
2159 /* if cpus or mems went down, we need to propagate to descendants */ 2249 mutex_unlock(&cpuset_mutex);
2160 if (cpus_offlined || mems_offlined) { 2250
2251 /* if cpus or mems changed, we need to propagate to descendants */
2252 if (cpus_updated || mems_updated) {
2161 struct cpuset *cs; 2253 struct cpuset *cs;
2162 struct cgroup *pos_cgrp; 2254 struct cgroup *pos_cgrp;
2163 2255
2164 rcu_read_lock(); 2256 rcu_read_lock();
2165 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) 2257 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) {
2166 schedule_cpuset_propagate_hotplug(cs); 2258 if (!css_tryget(&cs->css))
2167 rcu_read_unlock(); 2259 continue;
2168 } 2260 rcu_read_unlock();
2169 2261
2170 mutex_unlock(&cpuset_mutex); 2262 cpuset_hotplug_update_tasks(cs);
2171 2263
2172 /* wait for propagations to finish */ 2264 rcu_read_lock();
2173 flush_workqueue(cpuset_propagate_hotplug_wq); 2265 css_put(&cs->css);
2266 }
2267 rcu_read_unlock();
2268 }
2174 2269
2175 /* rebuild sched domains if cpus_allowed has changed */ 2270 /* rebuild sched domains if cpus_allowed has changed */
2176 if (cpus_updated) 2271 if (cpus_updated)
@@ -2219,12 +2314,9 @@ void __init cpuset_init_smp(void)
2219{ 2314{
2220 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2315 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2221 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2316 top_cpuset.mems_allowed = node_states[N_MEMORY];
2317 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
2222 2318
2223 register_hotmemory_notifier(&cpuset_track_online_nodes_nb); 2319 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2224
2225 cpuset_propagate_hotplug_wq =
2226 alloc_ordered_workqueue("cpuset_hotplug", 0);
2227 BUG_ON(!cpuset_propagate_hotplug_wq);
2228} 2320}
2229 2321
2230/** 2322/**
@@ -2240,21 +2332,23 @@ void __init cpuset_init_smp(void)
2240 2332
2241void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2333void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2242{ 2334{
2335 struct cpuset *cpus_cs;
2336
2243 mutex_lock(&callback_mutex); 2337 mutex_lock(&callback_mutex);
2244 task_lock(tsk); 2338 task_lock(tsk);
2245 guarantee_online_cpus(task_cs(tsk), pmask); 2339 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
2340 guarantee_online_cpus(cpus_cs, pmask);
2246 task_unlock(tsk); 2341 task_unlock(tsk);
2247 mutex_unlock(&callback_mutex); 2342 mutex_unlock(&callback_mutex);
2248} 2343}
2249 2344
2250void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2345void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2251{ 2346{
2252 const struct cpuset *cs; 2347 const struct cpuset *cpus_cs;
2253 2348
2254 rcu_read_lock(); 2349 rcu_read_lock();
2255 cs = task_cs(tsk); 2350 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
2256 if (cs) 2351 do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
2257 do_set_cpus_allowed(tsk, cs->cpus_allowed);
2258 rcu_read_unlock(); 2352 rcu_read_unlock();
2259 2353
2260 /* 2354 /*
@@ -2293,11 +2387,13 @@ void cpuset_init_current_mems_allowed(void)
2293 2387
2294nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 2388nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2295{ 2389{
2390 struct cpuset *mems_cs;
2296 nodemask_t mask; 2391 nodemask_t mask;
2297 2392
2298 mutex_lock(&callback_mutex); 2393 mutex_lock(&callback_mutex);
2299 task_lock(tsk); 2394 task_lock(tsk);
2300 guarantee_online_mems(task_cs(tsk), &mask); 2395 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
2396 guarantee_online_mems(mems_cs, &mask);
2301 task_unlock(tsk); 2397 task_unlock(tsk);
2302 mutex_unlock(&callback_mutex); 2398 mutex_unlock(&callback_mutex);
2303 2399
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9dc297faf7c0..1db3af933704 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -165,10 +165,28 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
165/* 165/*
166 * max perf event sample rate 166 * max perf event sample rate
167 */ 167 */
168#define DEFAULT_MAX_SAMPLE_RATE 100000 168#define DEFAULT_MAX_SAMPLE_RATE 100000
169int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; 169#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
170static int max_samples_per_tick __read_mostly = 170#define DEFAULT_CPU_TIME_MAX_PERCENT 25
171 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); 171
172int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
173
174static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
175static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
176
177static atomic_t perf_sample_allowed_ns __read_mostly =
178 ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
179
180void update_perf_cpu_limits(void)
181{
182 u64 tmp = perf_sample_period_ns;
183
184 tmp *= sysctl_perf_cpu_time_max_percent;
185 do_div(tmp, 100);
186 atomic_set(&perf_sample_allowed_ns, tmp);
187}
188
189static int perf_rotate_context(struct perf_cpu_context *cpuctx);
172 190
173int perf_proc_update_handler(struct ctl_table *table, int write, 191int perf_proc_update_handler(struct ctl_table *table, int write,
174 void __user *buffer, size_t *lenp, 192 void __user *buffer, size_t *lenp,
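
With the defaults just introduced, the arithmetic is small: the sample period is NSEC_PER_SEC / 100000 = 10,000ns, and at the 25% budget a single sample is allowed 2,500ns. (The kernel's do_div() divides its 64-bit first argument in place and returns the remainder, so update_perf_cpu_limits() only wants its side effect.) A minimal userspace check of the same numbers, with no kernel headers involved:

#include <stdio.h>

#define NSEC_PER_SEC                 1000000000ULL
#define DEFAULT_MAX_SAMPLE_RATE      100000
#define DEFAULT_CPU_TIME_MAX_PERCENT 25

int main(void)
{
    unsigned long long period_ns  = NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE;
    unsigned long long allowed_ns = period_ns * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

    /* prints period=10000ns allowed=2500ns with the defaults above */
    printf("period=%lluns allowed=%lluns\n", period_ns, allowed_ns);
    return 0;
}

The two sysctl handlers below simply redo this computation whenever the sample-rate or cpu-time-percent sysctls are written.
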
@@ -180,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
180 return ret; 198 return ret;
181 199
182 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); 200 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
201 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
202 update_perf_cpu_limits();
183 203
184 return 0; 204 return 0;
185} 205}
186 206
207int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
208
209int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
210 void __user *buffer, size_t *lenp,
211 loff_t *ppos)
212{
213 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
214
215 if (ret || !write)
216 return ret;
217
218 update_perf_cpu_limits();
219
220 return 0;
221}
222
223/*
224 * perf samples are done in some very critical code paths (NMIs).
225 * If they take too much CPU time, the system can lock up and not
226 * get any real work done. This will drop the sample rate when
227 * we detect that events are taking too long.
228 */
229#define NR_ACCUMULATED_SAMPLES 128
230DEFINE_PER_CPU(u64, running_sample_length);
231
232void perf_sample_event_took(u64 sample_len_ns)
233{
234 u64 avg_local_sample_len;
235 u64 local_samples_len = __get_cpu_var(running_sample_length);
236
237 if (atomic_read(&perf_sample_allowed_ns) == 0)
238 return;
239
240 /* decay the counter by 1 average sample */
241 local_samples_len = __get_cpu_var(running_sample_length);
242 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
243 local_samples_len += sample_len_ns;
244 __get_cpu_var(running_sample_length) = local_samples_len;
245
246 /*
247 * note: this will be biased artificially low until we have
248 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
249 * from having to maintain a count.
250 */
251 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
252
253 if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
254 return;
255
256 if (max_samples_per_tick <= 1)
257 return;
258
259 max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
260 sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
261 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
262
263 printk_ratelimited(KERN_WARNING
264 "perf samples too long (%lld > %d), lowering "
265 "kernel.perf_event_max_sample_rate to %d\n",
266 avg_local_sample_len,
267 atomic_read(&perf_sample_allowed_ns),
268 sysctl_perf_event_sample_rate);
269
270 update_perf_cpu_limits();
271}
272
187static atomic64_t perf_event_id; 273static atomic64_t perf_event_id;
188 274
189static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 275static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
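
The accumulator in perf_sample_event_took() is a decaying sum: each call removes one 1/128 share and adds the new sample, so dividing by 128 yields a running average that, as the comment says, starts biased low until 128 samples have been seen. A standalone sketch of the same arithmetic; the 2,500ns budget matches the defaults above, while the 4,000ns per-sample cost is an invented input:

#include <stdio.h>

#define NR_ACCUMULATED_SAMPLES 128

int main(void)
{
    unsigned long long running = 0;
    unsigned long long allowed_ns = 2500;   /* per-sample budget, as above    */
    unsigned long long sample_ns  = 4000;   /* pretend every sample costs 4us */
    int i;

    for (i = 0; i < 1024; i++) {
        running -= running / NR_ACCUMULATED_SAMPLES;   /* decay one share */
        running += sample_ns;
        if (running / NR_ACCUMULATED_SAMPLES > allowed_ns) {
            printf("would halve the sample rate after %d samples\n", i + 1);
            return 0;
        }
    }
    printf("within budget\n");
    return 0;
}

With these inputs the decayed average crosses the budget after a little over a hundred samples; in the kernel the reaction is the printk_ratelimited() warning and the rate reduction shown above.
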
@@ -196,9 +282,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
196static void update_context_time(struct perf_event_context *ctx); 282static void update_context_time(struct perf_event_context *ctx);
197static u64 perf_event_time(struct perf_event *event); 283static u64 perf_event_time(struct perf_event *event);
198 284
199static void ring_buffer_attach(struct perf_event *event,
200 struct ring_buffer *rb);
201
202void __weak perf_event_print_debug(void) { } 285void __weak perf_event_print_debug(void) { }
203 286
204extern __weak const char *perf_pmu_name(void) 287extern __weak const char *perf_pmu_name(void)
@@ -658,6 +741,106 @@ perf_cgroup_mark_enabled(struct perf_event *event,
658} 741}
659#endif 742#endif
660 743
744/*
745 * set default to be dependent on timer tick just
746 * like original code
747 */
748#define PERF_CPU_HRTIMER (1000 / HZ)
749/*
750 * function must be called with interrupts disabled
751 */
752static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
753{
754 struct perf_cpu_context *cpuctx;
755 enum hrtimer_restart ret = HRTIMER_NORESTART;
756 int rotations = 0;
757
758 WARN_ON(!irqs_disabled());
759
760 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
761
762 rotations = perf_rotate_context(cpuctx);
763
764 /*
765 * arm timer if needed
766 */
767 if (rotations) {
768 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
769 ret = HRTIMER_RESTART;
770 }
771
772 return ret;
773}
774
775/* CPU is going down */
776void perf_cpu_hrtimer_cancel(int cpu)
777{
778 struct perf_cpu_context *cpuctx;
779 struct pmu *pmu;
780 unsigned long flags;
781
782 if (WARN_ON(cpu != smp_processor_id()))
783 return;
784
785 local_irq_save(flags);
786
787 rcu_read_lock();
788
789 list_for_each_entry_rcu(pmu, &pmus, entry) {
790 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
791
792 if (pmu->task_ctx_nr == perf_sw_context)
793 continue;
794
795 hrtimer_cancel(&cpuctx->hrtimer);
796 }
797
798 rcu_read_unlock();
799
800 local_irq_restore(flags);
801}
802
803static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
804{
805 struct hrtimer *hr = &cpuctx->hrtimer;
806 struct pmu *pmu = cpuctx->ctx.pmu;
807 int timer;
808
809 /* no multiplexing needed for SW PMU */
810 if (pmu->task_ctx_nr == perf_sw_context)
811 return;
812
813 /*
814 * check default is sane, if not set then force to
815 * default interval (1/tick)
816 */
817 timer = pmu->hrtimer_interval_ms;
818 if (timer < 1)
819 timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
820
821 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
822
823 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
824 hr->function = perf_cpu_hrtimer_handler;
825}
826
827static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
828{
829 struct hrtimer *hr = &cpuctx->hrtimer;
830 struct pmu *pmu = cpuctx->ctx.pmu;
831
832 /* not for SW PMU */
833 if (pmu->task_ctx_nr == perf_sw_context)
834 return;
835
836 if (hrtimer_active(hr))
837 return;
838
839 if (!hrtimer_callback_running(hr))
840 __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
841 0, HRTIMER_MODE_REL_PINNED, 0);
842}
843
661void perf_pmu_disable(struct pmu *pmu) 844void perf_pmu_disable(struct pmu *pmu)
662{ 845{
663 int *count = this_cpu_ptr(pmu->pmu_disable_count); 846 int *count = this_cpu_ptr(pmu->pmu_disable_count);
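
perf_cpu_hrtimer_handler() is the usual one-shot pattern: do the work, then re-arm only if perf_rotate_context() reported something left to rotate, otherwise let the timer lapse until perf_cpu_hrtimer_restart() arms it again. A loose userspace analogue with POSIX timers instead of hrtimers; the 4ms interval (1000/HZ with HZ=250) and the remaining counter standing in for pending rotations are invented for the sketch:

#include <signal.h>
#include <stdio.h>
#include <time.h>

#define INTERVAL_MS 4                       /* stand-in for hrtimer_interval_ms */

static timer_t timerid;
static volatile sig_atomic_t remaining = 3; /* pretend three rotations are pending */

static void arm(void)
{
    struct itimerspec its = {
        .it_value = { .tv_sec = 0, .tv_nsec = INTERVAL_MS * 1000000L },
    };                                      /* it_interval stays 0: one-shot */
    timer_settime(timerid, 0, &its, NULL);
}

static void handler(int sig)
{
    (void)sig;
    if (remaining && --remaining)
        arm();                              /* "rotations != 0": keep it running */
}

int main(void)
{
    struct sigaction sa = { .sa_handler = handler };
    struct sigevent sev = {
        .sigev_notify = SIGEV_SIGNAL,
        .sigev_signo  = SIGALRM,
    };

    sigemptyset(&sa.sa_mask);
    sigaction(SIGALRM, &sa, NULL);
    timer_create(CLOCK_MONOTONIC, &sev, &timerid);
    arm();

    while (remaining) {
        struct timespec ts = { 0, INTERVAL_MS * 1000000L };
        nanosleep(&ts, NULL);               /* EINTR just loops again */
    }
    printf("timer left idle once nothing needed rotating\n");
    return 0;
}

Build with -lrt on older glibc. The payoff in the kernel version is the same: when no context needs multiplexing the timer simply stops, unlike the per-tick jiffies_interval check that this series removes from perf_event_task_tick().
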
@@ -1506,6 +1689,7 @@ group_sched_in(struct perf_event *group_event,
1506 1689
1507 if (event_sched_in(group_event, cpuctx, ctx)) { 1690 if (event_sched_in(group_event, cpuctx, ctx)) {
1508 pmu->cancel_txn(pmu); 1691 pmu->cancel_txn(pmu);
1692 perf_cpu_hrtimer_restart(cpuctx);
1509 return -EAGAIN; 1693 return -EAGAIN;
1510 } 1694 }
1511 1695
@@ -1552,6 +1736,8 @@ group_error:
1552 1736
1553 pmu->cancel_txn(pmu); 1737 pmu->cancel_txn(pmu);
1554 1738
1739 perf_cpu_hrtimer_restart(cpuctx);
1740
1555 return -EAGAIN; 1741 return -EAGAIN;
1556} 1742}
1557 1743
@@ -1807,8 +1993,10 @@ static int __perf_event_enable(void *info)
1807 * If this event can't go on and it's part of a 1993 * If this event can't go on and it's part of a
1808 * group, then the whole group has to come off. 1994 * group, then the whole group has to come off.
1809 */ 1995 */
1810 if (leader != event) 1996 if (leader != event) {
1811 group_sched_out(leader, cpuctx, ctx); 1997 group_sched_out(leader, cpuctx, ctx);
1998 perf_cpu_hrtimer_restart(cpuctx);
1999 }
1812 if (leader->attr.pinned) { 2000 if (leader->attr.pinned) {
1813 update_group_times(leader); 2001 update_group_times(leader);
1814 leader->state = PERF_EVENT_STATE_ERROR; 2002 leader->state = PERF_EVENT_STATE_ERROR;
@@ -2555,7 +2743,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
2555 * because they're strictly cpu affine and rotate_start is called with IRQs 2743 * because they're strictly cpu affine and rotate_start is called with IRQs
2556 * disabled, while rotate_context is called from IRQ context. 2744 * disabled, while rotate_context is called from IRQ context.
2557 */ 2745 */
2558static void perf_rotate_context(struct perf_cpu_context *cpuctx) 2746static int perf_rotate_context(struct perf_cpu_context *cpuctx)
2559{ 2747{
2560 struct perf_event_context *ctx = NULL; 2748 struct perf_event_context *ctx = NULL;
2561 int rotate = 0, remove = 1; 2749 int rotate = 0, remove = 1;
@@ -2594,6 +2782,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2594done: 2782done:
2595 if (remove) 2783 if (remove)
2596 list_del_init(&cpuctx->rotation_list); 2784 list_del_init(&cpuctx->rotation_list);
2785
2786 return rotate;
2597} 2787}
2598 2788
2599#ifdef CONFIG_NO_HZ_FULL 2789#ifdef CONFIG_NO_HZ_FULL
@@ -2625,10 +2815,6 @@ void perf_event_task_tick(void)
2625 ctx = cpuctx->task_ctx; 2815 ctx = cpuctx->task_ctx;
2626 if (ctx) 2816 if (ctx)
2627 perf_adjust_freq_unthr_context(ctx, throttled); 2817 perf_adjust_freq_unthr_context(ctx, throttled);
2628
2629 if (cpuctx->jiffies_interval == 1 ||
2630 !(jiffies % cpuctx->jiffies_interval))
2631 perf_rotate_context(cpuctx);
2632 } 2818 }
2633} 2819}
2634 2820
@@ -2918,6 +3104,7 @@ static void free_event_rcu(struct rcu_head *head)
2918} 3104}
2919 3105
2920static void ring_buffer_put(struct ring_buffer *rb); 3106static void ring_buffer_put(struct ring_buffer *rb);
3107static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
2921 3108
2922static void free_event(struct perf_event *event) 3109static void free_event(struct perf_event *event)
2923{ 3110{
@@ -2942,15 +3129,30 @@ static void free_event(struct perf_event *event)
2942 if (has_branch_stack(event)) { 3129 if (has_branch_stack(event)) {
2943 static_key_slow_dec_deferred(&perf_sched_events); 3130 static_key_slow_dec_deferred(&perf_sched_events);
2944 /* is system-wide event */ 3131 /* is system-wide event */
2945 if (!(event->attach_state & PERF_ATTACH_TASK)) 3132 if (!(event->attach_state & PERF_ATTACH_TASK)) {
2946 atomic_dec(&per_cpu(perf_branch_stack_events, 3133 atomic_dec(&per_cpu(perf_branch_stack_events,
2947 event->cpu)); 3134 event->cpu));
3135 }
2948 } 3136 }
2949 } 3137 }
2950 3138
2951 if (event->rb) { 3139 if (event->rb) {
2952 ring_buffer_put(event->rb); 3140 struct ring_buffer *rb;
2953 event->rb = NULL; 3141
3142 /*
3143 * Can happen when we close an event with re-directed output.
3144 *
3145 * Since we have a 0 refcount, perf_mmap_close() will skip
3146 * over us; possibly making our ring_buffer_put() the last.
3147 */
3148 mutex_lock(&event->mmap_mutex);
3149 rb = event->rb;
3150 if (rb) {
3151 rcu_assign_pointer(event->rb, NULL);
3152 ring_buffer_detach(event, rb);
3153 ring_buffer_put(rb); /* could be last */
3154 }
3155 mutex_unlock(&event->mmap_mutex);
2954 } 3156 }
2955 3157
2956 if (is_cgroup_event(event)) 3158 if (is_cgroup_event(event))
@@ -3188,30 +3390,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3188 unsigned int events = POLL_HUP; 3390 unsigned int events = POLL_HUP;
3189 3391
3190 /* 3392 /*
3191 * Race between perf_event_set_output() and perf_poll(): perf_poll() 3393 * Pin the event->rb by taking event->mmap_mutex; otherwise
3192 * grabs the rb reference but perf_event_set_output() overrides it. 3394 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
3193 * Here is the timeline for two threads T1, T2:
3194 * t0: T1, rb = rcu_dereference(event->rb)
3195 * t1: T2, old_rb = event->rb
3196 * t2: T2, event->rb = new rb
3197 * t3: T2, ring_buffer_detach(old_rb)
3198 * t4: T1, ring_buffer_attach(rb1)
3199 * t5: T1, poll_wait(event->waitq)
3200 *
3201 * To avoid this problem, we grab mmap_mutex in perf_poll()
3202 * thereby ensuring that the assignment of the new ring buffer
3203 * and the detachment of the old buffer appear atomic to perf_poll()
3204 */ 3395 */
3205 mutex_lock(&event->mmap_mutex); 3396 mutex_lock(&event->mmap_mutex);
3206 3397 rb = event->rb;
3207 rcu_read_lock(); 3398 if (rb)
3208 rb = rcu_dereference(event->rb);
3209 if (rb) {
3210 ring_buffer_attach(event, rb);
3211 events = atomic_xchg(&rb->poll, 0); 3399 events = atomic_xchg(&rb->poll, 0);
3212 }
3213 rcu_read_unlock();
3214
3215 mutex_unlock(&event->mmap_mutex); 3400 mutex_unlock(&event->mmap_mutex);
3216 3401
3217 poll_wait(file, &event->waitq, wait); 3402 poll_wait(file, &event->waitq, wait);
@@ -3521,16 +3706,12 @@ static void ring_buffer_attach(struct perf_event *event,
3521 return; 3706 return;
3522 3707
3523 spin_lock_irqsave(&rb->event_lock, flags); 3708 spin_lock_irqsave(&rb->event_lock, flags);
3524 if (!list_empty(&event->rb_entry)) 3709 if (list_empty(&event->rb_entry))
3525 goto unlock; 3710 list_add(&event->rb_entry, &rb->event_list);
3526
3527 list_add(&event->rb_entry, &rb->event_list);
3528unlock:
3529 spin_unlock_irqrestore(&rb->event_lock, flags); 3711 spin_unlock_irqrestore(&rb->event_lock, flags);
3530} 3712}
3531 3713
3532static void ring_buffer_detach(struct perf_event *event, 3714static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
3533 struct ring_buffer *rb)
3534{ 3715{
3535 unsigned long flags; 3716 unsigned long flags;
3536 3717
@@ -3549,13 +3730,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
3549 3730
3550 rcu_read_lock(); 3731 rcu_read_lock();
3551 rb = rcu_dereference(event->rb); 3732 rb = rcu_dereference(event->rb);
3552 if (!rb) 3733 if (rb) {
3553 goto unlock; 3734 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
3554 3735 wake_up_all(&event->waitq);
3555 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) 3736 }
3556 wake_up_all(&event->waitq);
3557
3558unlock:
3559 rcu_read_unlock(); 3737 rcu_read_unlock();
3560} 3738}
3561 3739
@@ -3584,18 +3762,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3584 3762
3585static void ring_buffer_put(struct ring_buffer *rb) 3763static void ring_buffer_put(struct ring_buffer *rb)
3586{ 3764{
3587 struct perf_event *event, *n;
3588 unsigned long flags;
3589
3590 if (!atomic_dec_and_test(&rb->refcount)) 3765 if (!atomic_dec_and_test(&rb->refcount))
3591 return; 3766 return;
3592 3767
3593 spin_lock_irqsave(&rb->event_lock, flags); 3768 WARN_ON_ONCE(!list_empty(&rb->event_list));
3594 list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
3595 list_del_init(&event->rb_entry);
3596 wake_up_all(&event->waitq);
3597 }
3598 spin_unlock_irqrestore(&rb->event_lock, flags);
3599 3769
3600 call_rcu(&rb->rcu_head, rb_free_rcu); 3770 call_rcu(&rb->rcu_head, rb_free_rcu);
3601} 3771}
@@ -3605,26 +3775,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
3605 struct perf_event *event = vma->vm_file->private_data; 3775 struct perf_event *event = vma->vm_file->private_data;
3606 3776
3607 atomic_inc(&event->mmap_count); 3777 atomic_inc(&event->mmap_count);
3778 atomic_inc(&event->rb->mmap_count);
3608} 3779}
3609 3780
3781/*
3782 * A buffer can be mmap()ed multiple times; either directly through the same
3783 * event, or through other events by use of perf_event_set_output().
3784 *
3785 * In order to undo the VM accounting done by perf_mmap() we need to destroy
3786 * the buffer here, where we still have a VM context. This means we need
3787 * to detach all events redirecting to us.
3788 */
3610static void perf_mmap_close(struct vm_area_struct *vma) 3789static void perf_mmap_close(struct vm_area_struct *vma)
3611{ 3790{
3612 struct perf_event *event = vma->vm_file->private_data; 3791 struct perf_event *event = vma->vm_file->private_data;
3613 3792
3614 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 3793 struct ring_buffer *rb = event->rb;
3615 unsigned long size = perf_data_size(event->rb); 3794 struct user_struct *mmap_user = rb->mmap_user;
3616 struct user_struct *user = event->mmap_user; 3795 int mmap_locked = rb->mmap_locked;
3617 struct ring_buffer *rb = event->rb; 3796 unsigned long size = perf_data_size(rb);
3797
3798 atomic_dec(&rb->mmap_count);
3799
3800 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
3801 return;
3618 3802
3619 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3803 /* Detach current event from the buffer. */
3620 vma->vm_mm->pinned_vm -= event->mmap_locked; 3804 rcu_assign_pointer(event->rb, NULL);
3621 rcu_assign_pointer(event->rb, NULL); 3805 ring_buffer_detach(event, rb);
3622 ring_buffer_detach(event, rb); 3806 mutex_unlock(&event->mmap_mutex);
3807
3808 /* If there's still other mmap()s of this buffer, we're done. */
3809 if (atomic_read(&rb->mmap_count)) {
3810 ring_buffer_put(rb); /* can't be last */
3811 return;
3812 }
3813
3814 /*
3815 * No other mmap()s, detach from all other events that might redirect
3816 * into the now unreachable buffer. Somewhat complicated by the
3817 * fact that rb::event_lock otherwise nests inside mmap_mutex.
3818 */
3819again:
3820 rcu_read_lock();
3821 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
3822 if (!atomic_long_inc_not_zero(&event->refcount)) {
3823 /*
3824 * This event is en-route to free_event() which will
3825 * detach it and remove it from the list.
3826 */
3827 continue;
3828 }
3829 rcu_read_unlock();
3830
3831 mutex_lock(&event->mmap_mutex);
3832 /*
3833 * Check we didn't race with perf_event_set_output() which can
3834 * swizzle the rb from under us while we were waiting to
3835 * acquire mmap_mutex.
3836 *
3837 * If we find a different rb, ignore this event; the next
3838 * iteration will no longer find it on the list. We have to
3839 * still restart the iteration to make sure we're not now
3840 * iterating the wrong list.
3841 */
3842 if (event->rb == rb) {
3843 rcu_assign_pointer(event->rb, NULL);
3844 ring_buffer_detach(event, rb);
3845 ring_buffer_put(rb); /* can't be last, we still have one */
3846 }
3623 mutex_unlock(&event->mmap_mutex); 3847 mutex_unlock(&event->mmap_mutex);
3848 put_event(event);
3624 3849
3625 ring_buffer_put(rb); 3850 /*
3626 free_uid(user); 3851 * Restart the iteration; either we're on the wrong list or
3852 * destroyed its integrity by doing a deletion.
3853 */
3854 goto again;
3627 } 3855 }
3856 rcu_read_unlock();
3857
3858 /*
3859 * It could be there's still a few 0-ref events on the list; they'll
3860 * get cleaned up by free_event() -- they'll also still have their
3861 * ref on the rb and will free it whenever they are done with it.
3862 *
3863 * Aside from that, this buffer is 'fully' detached and unmapped,
3864 * undo the VM accounting.
3865 */
3866
3867 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
3868 vma->vm_mm->pinned_vm -= mmap_locked;
3869 free_uid(mmap_user);
3870
3871 ring_buffer_put(rb); /* could be last */
3628} 3872}
3629 3873
3630static const struct vm_operations_struct perf_mmap_vmops = { 3874static const struct vm_operations_struct perf_mmap_vmops = {
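
The recurring '/* could be last */' and '/* can't be last */' notes above are the heart of the change: once several events and mmap()s can reach one buffer, any ring_buffer_put() might drop the final reference, so the free hangs off the counter rather than off any particular call site. A stripped-down userspace model of that discipline in C11 atomics; struct buf, buf_get() and buf_put() are invented names, and the kernel additionally defers the actual free through call_rcu():

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct buf {
    atomic_int refcount;
    /* payload would live here */
};

static struct buf *buf_get(struct buf *b)
{
    atomic_fetch_add(&b->refcount, 1);
    return b;
}

static void buf_put(struct buf *b)
{
    /* whichever caller drops the count to zero frees: any put could be last */
    if (atomic_fetch_sub(&b->refcount, 1) == 1) {
        printf("last reference gone, freeing\n");
        free(b);
    }
}

int main(void)
{
    struct buf *b = malloc(sizeof(*b));

    atomic_init(&b->refcount, 1);   /* creator's reference                 */
    buf_get(b);                     /* say, a second mmap() of the buffer  */
    buf_put(b);                     /* not last: another reference remains */
    buf_put(b);                     /* last one: frees                     */
    return 0;
}

perf_mmap_close() above applies exactly this rule: the put taken while other mmaps remain is annotated as not last, and only the final put after the VM accounting may free the buffer.
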
@@ -3674,12 +3918,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3674 return -EINVAL; 3918 return -EINVAL;
3675 3919
3676 WARN_ON_ONCE(event->ctx->parent_ctx); 3920 WARN_ON_ONCE(event->ctx->parent_ctx);
3921again:
3677 mutex_lock(&event->mmap_mutex); 3922 mutex_lock(&event->mmap_mutex);
3678 if (event->rb) { 3923 if (event->rb) {
3679 if (event->rb->nr_pages == nr_pages) 3924 if (event->rb->nr_pages != nr_pages) {
3680 atomic_inc(&event->rb->refcount);
3681 else
3682 ret = -EINVAL; 3925 ret = -EINVAL;
3926 goto unlock;
3927 }
3928
3929 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
3930 /*
3931 * Raced against perf_mmap_close() through
3932 * perf_event_set_output(). Try again, hope for better
3933 * luck.
3934 */
3935 mutex_unlock(&event->mmap_mutex);
3936 goto again;
3937 }
3938
3683 goto unlock; 3939 goto unlock;
3684 } 3940 }
3685 3941
@@ -3720,12 +3976,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3720 ret = -ENOMEM; 3976 ret = -ENOMEM;
3721 goto unlock; 3977 goto unlock;
3722 } 3978 }
3723 rcu_assign_pointer(event->rb, rb); 3979
3980 atomic_set(&rb->mmap_count, 1);
3981 rb->mmap_locked = extra;
3982 rb->mmap_user = get_current_user();
3724 3983
3725 atomic_long_add(user_extra, &user->locked_vm); 3984 atomic_long_add(user_extra, &user->locked_vm);
3726 event->mmap_locked = extra; 3985 vma->vm_mm->pinned_vm += extra;
3727 event->mmap_user = get_current_user(); 3986
3728 vma->vm_mm->pinned_vm += event->mmap_locked; 3987 ring_buffer_attach(event, rb);
3988 rcu_assign_pointer(event->rb, rb);
3729 3989
3730 perf_event_update_userpage(event); 3990 perf_event_update_userpage(event);
3731 3991
@@ -3734,7 +3994,11 @@ unlock:
3734 atomic_inc(&event->mmap_count); 3994 atomic_inc(&event->mmap_count);
3735 mutex_unlock(&event->mmap_mutex); 3995 mutex_unlock(&event->mmap_mutex);
3736 3996
3737 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 3997 /*
3998 * Since pinned accounting is per vm we cannot allow fork() to copy our
3999 * vma.
4000 */
4001 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
3738 vma->vm_ops = &perf_mmap_vmops; 4002 vma->vm_ops = &perf_mmap_vmops;
3739 4003
3740 return ret; 4004 return ret;
@@ -4961,7 +5225,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
4961 * sign as trigger. 5225 * sign as trigger.
4962 */ 5226 */
4963 5227
4964static u64 perf_swevent_set_period(struct perf_event *event) 5228u64 perf_swevent_set_period(struct perf_event *event)
4965{ 5229{
4966 struct hw_perf_event *hwc = &event->hw; 5230 struct hw_perf_event *hwc = &event->hw;
4967 u64 period = hwc->last_period; 5231 u64 period = hwc->last_period;
@@ -5904,9 +6168,56 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
5904 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); 6168 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5905} 6169}
5906 6170
6171static ssize_t
6172perf_event_mux_interval_ms_show(struct device *dev,
6173 struct device_attribute *attr,
6174 char *page)
6175{
6176 struct pmu *pmu = dev_get_drvdata(dev);
6177
6178 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
6179}
6180
6181static ssize_t
6182perf_event_mux_interval_ms_store(struct device *dev,
6183 struct device_attribute *attr,
6184 const char *buf, size_t count)
6185{
6186 struct pmu *pmu = dev_get_drvdata(dev);
6187 int timer, cpu, ret;
6188
6189 ret = kstrtoint(buf, 0, &timer);
6190 if (ret)
6191 return ret;
6192
6193 if (timer < 1)
6194 return -EINVAL;
6195
6196 /* same value, nothing to do */
6197 if (timer == pmu->hrtimer_interval_ms)
6198 return count;
6199
6200 pmu->hrtimer_interval_ms = timer;
6201
6202 /* update all cpuctx for this PMU */
6203 for_each_possible_cpu(cpu) {
6204 struct perf_cpu_context *cpuctx;
6205 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6206 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
6207
6208 if (hrtimer_active(&cpuctx->hrtimer))
6209 hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
6210 }
6211
6212 return count;
6213}
6214
6215#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
6216
5907static struct device_attribute pmu_dev_attrs[] = { 6217static struct device_attribute pmu_dev_attrs[] = {
5908 __ATTR_RO(type), 6218 __ATTR_RO(type),
5909 __ATTR_NULL, 6219 __ATTR_RW(perf_event_mux_interval_ms),
6220 __ATTR_NULL,
5910}; 6221};
5911 6222
5912static int pmu_bus_running; 6223static int pmu_bus_running;
@@ -5952,7 +6263,7 @@ free_dev:
5952static struct lock_class_key cpuctx_mutex; 6263static struct lock_class_key cpuctx_mutex;
5953static struct lock_class_key cpuctx_lock; 6264static struct lock_class_key cpuctx_lock;
5954 6265
5955int perf_pmu_register(struct pmu *pmu, char *name, int type) 6266int perf_pmu_register(struct pmu *pmu, const char *name, int type)
5956{ 6267{
5957 int cpu, ret; 6268 int cpu, ret;
5958 6269
@@ -6001,7 +6312,9 @@ skip_type:
6001 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); 6312 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6002 cpuctx->ctx.type = cpu_context; 6313 cpuctx->ctx.type = cpu_context;
6003 cpuctx->ctx.pmu = pmu; 6314 cpuctx->ctx.pmu = pmu;
6004 cpuctx->jiffies_interval = 1; 6315
6316 __perf_cpu_hrtimer_init(cpuctx, cpu);
6317
6005 INIT_LIST_HEAD(&cpuctx->rotation_list); 6318 INIT_LIST_HEAD(&cpuctx->rotation_list);
6006 cpuctx->unique_pmu = pmu; 6319 cpuctx->unique_pmu = pmu;
6007 } 6320 }
@@ -6327,11 +6640,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6327 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) 6640 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
6328 return -EINVAL; 6641 return -EINVAL;
6329 6642
6330 /* kernel level capture: check permissions */
6331 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6332 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6333 return -EACCES;
6334
6335 /* propagate priv level, when not set for branch */ 6643 /* propagate priv level, when not set for branch */
6336 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { 6644 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
6337 6645
@@ -6349,6 +6657,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6349 */ 6657 */
6350 attr->branch_sample_type = mask; 6658 attr->branch_sample_type = mask;
6351 } 6659 }
6660 /* privileged levels capture (kernel, hv): check permissions */
6661 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6662 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6663 return -EACCES;
6352 } 6664 }
6353 6665
6354 if (attr->sample_type & PERF_SAMPLE_REGS_USER) { 6666 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -6412,6 +6724,8 @@ set:
6412 if (atomic_read(&event->mmap_count)) 6724 if (atomic_read(&event->mmap_count))
6413 goto unlock; 6725 goto unlock;
6414 6726
6727 old_rb = event->rb;
6728
6415 if (output_event) { 6729 if (output_event) {
6416 /* get the rb we want to redirect to */ 6730 /* get the rb we want to redirect to */
6417 rb = ring_buffer_get(output_event); 6731 rb = ring_buffer_get(output_event);
@@ -6419,16 +6733,28 @@ set:
6419 goto unlock; 6733 goto unlock;
6420 } 6734 }
6421 6735
6422 old_rb = event->rb;
6423 rcu_assign_pointer(event->rb, rb);
6424 if (old_rb) 6736 if (old_rb)
6425 ring_buffer_detach(event, old_rb); 6737 ring_buffer_detach(event, old_rb);
6738
6739 if (rb)
6740 ring_buffer_attach(event, rb);
6741
6742 rcu_assign_pointer(event->rb, rb);
6743
6744 if (old_rb) {
6745 ring_buffer_put(old_rb);
6746 /*
6747 * We had to detach the old rb before attaching the new one,
6748 * so a wakeup issued in that window could have been missed.
6749 * Provide it now.
6750 */
6751 wake_up_all(&event->waitq);
6752 }
6753
6426 ret = 0; 6754 ret = 0;
6427unlock: 6755unlock:
6428 mutex_unlock(&event->mmap_mutex); 6756 mutex_unlock(&event->mmap_mutex);
6429 6757
6430 if (old_rb)
6431 ring_buffer_put(old_rb);
6432out: 6758out:
6433 return ret; 6759 return ret;
6434} 6760}
@@ -7387,7 +7713,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
7387 case CPU_DOWN_PREPARE: 7713 case CPU_DOWN_PREPARE:
7388 perf_event_exit_cpu(cpu); 7714 perf_event_exit_cpu(cpu);
7389 break; 7715 break;
7390
7391 default: 7716 default:
7392 break; 7717 break;
7393 } 7718 }
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index a64f8aeb5c1f..1559fb0b9296 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -46,23 +46,26 @@
46#include <linux/smp.h> 46#include <linux/smp.h>
47 47
48#include <linux/hw_breakpoint.h> 48#include <linux/hw_breakpoint.h>
49
50
51/* 49/*
52 * Constraints data 50 * Constraints data
53 */ 51 */
52struct bp_cpuinfo {
53 /* Number of pinned cpu breakpoints in a cpu */
54 unsigned int cpu_pinned;
55 /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
56 unsigned int *tsk_pinned;
57 /* Number of non-pinned cpu/task breakpoints in a cpu */
58 unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
59};
54 60
55/* Number of pinned cpu breakpoints in a cpu */ 61static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
56static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
57
58/* Number of pinned task breakpoints in a cpu */
59static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
60
61/* Number of non-pinned cpu/task breakpoints in a cpu */
62static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
63
64static int nr_slots[TYPE_MAX]; 62static int nr_slots[TYPE_MAX];
65 63
64static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
65{
66 return per_cpu_ptr(bp_cpuinfo + type, cpu);
67}
68
66/* Keep track of the breakpoints attached to tasks */ 69/* Keep track of the breakpoints attached to tasks */
67static LIST_HEAD(bp_task_head); 70static LIST_HEAD(bp_task_head);
68 71
@@ -96,8 +99,8 @@ static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
96 */ 99 */
97static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) 100static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
98{ 101{
102 unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
99 int i; 103 int i;
100 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
101 104
102 for (i = nr_slots[type] - 1; i >= 0; i--) { 105 for (i = nr_slots[type] - 1; i >= 0; i--) {
103 if (tsk_pinned[i] > 0) 106 if (tsk_pinned[i] > 0)
@@ -120,13 +123,20 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 123 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
121 if (iter->hw.bp_target == tsk && 124 if (iter->hw.bp_target == tsk &&
122 find_slot_idx(iter) == type && 125 find_slot_idx(iter) == type &&
123 cpu == iter->cpu) 126 (iter->cpu < 0 || cpu == iter->cpu))
124 count += hw_breakpoint_weight(iter); 127 count += hw_breakpoint_weight(iter);
125 } 128 }
126 129
127 return count; 130 return count;
128} 131}
129 132
133static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
134{
135 if (bp->cpu >= 0)
136 return cpumask_of(bp->cpu);
137 return cpu_possible_mask;
138}
139
130/* 140/*
131 * Report the number of pinned/un-pinned breakpoints we have in 141 * Report the number of pinned/un-pinned breakpoints we have in
132 * a given cpu (cpu > -1) or in all of them (cpu = -1). 142 * a given cpu (cpu > -1) or in all of them (cpu = -1).
@@ -135,25 +145,15 @@ static void
135fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, 145fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
136 enum bp_type_idx type) 146 enum bp_type_idx type)
137{ 147{
138 int cpu = bp->cpu; 148 const struct cpumask *cpumask = cpumask_of_bp(bp);
139 struct task_struct *tsk = bp->hw.bp_target; 149 int cpu;
140
141 if (cpu >= 0) {
142 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
143 if (!tsk)
144 slots->pinned += max_task_bp_pinned(cpu, type);
145 else
146 slots->pinned += task_bp_pinned(cpu, bp, type);
147 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
148
149 return;
150 }
151 150
152 for_each_online_cpu(cpu) { 151 for_each_cpu(cpu, cpumask) {
153 unsigned int nr; 152 struct bp_cpuinfo *info = get_bp_info(cpu, type);
153 int nr;
154 154
155 nr = per_cpu(nr_cpu_bp_pinned[type], cpu); 155 nr = info->cpu_pinned;
156 if (!tsk) 156 if (!bp->hw.bp_target)
157 nr += max_task_bp_pinned(cpu, type); 157 nr += max_task_bp_pinned(cpu, type);
158 else 158 else
159 nr += task_bp_pinned(cpu, bp, type); 159 nr += task_bp_pinned(cpu, bp, type);
@@ -161,8 +161,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
161 if (nr > slots->pinned) 161 if (nr > slots->pinned)
162 slots->pinned = nr; 162 slots->pinned = nr;
163 163
164 nr = per_cpu(nr_bp_flexible[type], cpu); 164 nr = info->flexible;
165
166 if (nr > slots->flexible) 165 if (nr > slots->flexible)
167 slots->flexible = nr; 166 slots->flexible = nr;
168 } 167 }
@@ -182,29 +181,19 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
182/* 181/*
183 * Add a pinned breakpoint for the given task in our constraint table 182 * Add a pinned breakpoint for the given task in our constraint table
184 */ 183 */
185static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, 184static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
186 enum bp_type_idx type, int weight) 185 enum bp_type_idx type, int weight)
187{ 186{
188 unsigned int *tsk_pinned; 187 unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
189 int old_count = 0; 188 int old_idx, new_idx;
190 int old_idx = 0; 189
191 int idx = 0; 190 old_idx = task_bp_pinned(cpu, bp, type) - 1;
192 191 new_idx = old_idx + weight;
193 old_count = task_bp_pinned(cpu, bp, type); 192
194 old_idx = old_count - 1; 193 if (old_idx >= 0)
195 idx = old_idx + weight; 194 tsk_pinned[old_idx]--;
196 195 if (new_idx >= 0)
197 /* tsk_pinned[n] is the number of tasks having n breakpoints */ 196 tsk_pinned[new_idx]++;
198 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
199 if (enable) {
200 tsk_pinned[idx]++;
201 if (old_count > 0)
202 tsk_pinned[old_idx]--;
203 } else {
204 tsk_pinned[idx]--;
205 if (old_count > 0)
206 tsk_pinned[old_idx]++;
207 }
208} 197}
209 198
210/* 199/*
@@ -214,33 +203,26 @@ static void
214toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, 203toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
215 int weight) 204 int weight)
216{ 205{
217 int cpu = bp->cpu; 206 const struct cpumask *cpumask = cpumask_of_bp(bp);
218 struct task_struct *tsk = bp->hw.bp_target; 207 int cpu;
219 208
220 /* Pinned counter cpu profiling */ 209 if (!enable)
221 if (!tsk) { 210 weight = -weight;
222 211
223 if (enable) 212 /* Pinned counter cpu profiling */
224 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; 213 if (!bp->hw.bp_target) {
225 else 214 get_bp_info(bp->cpu, type)->cpu_pinned += weight;
226 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
227 return; 215 return;
228 } 216 }
229 217
230 /* Pinned counter task profiling */ 218 /* Pinned counter task profiling */
231 219 for_each_cpu(cpu, cpumask)
232 if (!enable) 220 toggle_bp_task_slot(bp, cpu, type, weight);
233 list_del(&bp->hw.bp_list);
234
235 if (cpu >= 0) {
236 toggle_bp_task_slot(bp, cpu, enable, type, weight);
237 } else {
238 for_each_online_cpu(cpu)
239 toggle_bp_task_slot(bp, cpu, enable, type, weight);
240 }
241 221
242 if (enable) 222 if (enable)
243 list_add_tail(&bp->hw.bp_list, &bp_task_head); 223 list_add_tail(&bp->hw.bp_list, &bp_task_head);
224 else
225 list_del(&bp->hw.bp_list);
244} 226}
245 227
246/* 228/*
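
toggle_bp_task_slot() above keeps a per-cpu histogram: tsk_pinned[n] counts the tasks that currently own n+1 breakpoints, so moving a task from its old count to its new one is a decrement at one index and an increment at another, and removal is the same call with the weight negated by toggle_bp_slot(). A self-contained illustration; HBP_NUM and move_task() are stand-ins invented for the sketch:

#include <stdio.h>

#define HBP_NUM 4

/* tsk_pinned[n] counts tasks that currently own n+1 breakpoints */
static unsigned int tsk_pinned[HBP_NUM];

static void move_task(int old_count, int weight)
{
    int old_idx = old_count - 1;
    int new_idx = old_idx + weight;

    if (old_idx >= 0)
        tsk_pinned[old_idx]--;
    if (new_idx >= 0)
        tsk_pinned[new_idx]++;
}

static void dump(const char *when)
{
    int i;

    printf("%s:", when);
    for (i = 0; i < HBP_NUM; i++)
        printf(" [%d bp]=%u", i + 1, tsk_pinned[i]);
    printf("\n");
}

int main(void)
{
    move_task(0, 1);   /* a task installs its first breakpoint                */
    dump("after first");
    move_task(1, 1);   /* the same task installs a second one                 */
    dump("after second");
    move_task(2, -1);  /* it removes one again: weight negated, as in the diff */
    dump("after removal");
    return 0;
}

max_task_bp_pinned() then only has to walk this array from the top to find the busiest task on a cpu.
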
@@ -261,8 +243,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
261 * 243 *
262 * - If attached to a single cpu, check: 244 * - If attached to a single cpu, check:
263 * 245 *
264 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) 246 * (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu)
265 * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM 247 * + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM
266 * 248 *
267 * -> If there are already non-pinned counters in this cpu, it means 249 * -> If there are already non-pinned counters in this cpu, it means
268 * there is already a free slot for them. 250 * there is already a free slot for them.
@@ -272,8 +254,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
272 * 254 *
273 * - If attached to every cpus, check: 255 * - If attached to every cpus, check:
274 * 256 *
275 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) 257 * (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *))
276 * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM 258 * + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM
277 * 259 *
278 * -> This is roughly the same, except we check the number of per cpu 260 * -> This is roughly the same, except we check the number of per cpu
279 * bp for every cpu and we keep the max one. Same for the per tasks 261 * bp for every cpu and we keep the max one. Same for the per tasks
@@ -284,16 +266,16 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
284 * 266 *
285 * - If attached to a single cpu, check: 267 * - If attached to a single cpu, check:
286 * 268 *
287 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) 269 * ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu)
288 * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM 270 * + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM
289 * 271 *
290 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep 272 * -> Same checks as before. But now the info->flexible, if any, must keep
291 * one register at least (or they will never be fed). 273 * one register at least (or they will never be fed).
292 * 274 *
293 * - If attached to every cpus, check: 275 * - If attached to every cpus, check:
294 * 276 *
295 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) 277 * ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
296 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM 278 * + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
297 */ 279 */
298static int __reserve_bp_slot(struct perf_event *bp) 280static int __reserve_bp_slot(struct perf_event *bp)
299{ 281{
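
The constraint comments above reduce to small integer arithmetic: every pinned breakpoint that can land on a cpu, plus one slot held back whenever that cpu also has flexible breakpoints, must fit in the debug-register budget. A toy model with invented numbers (HBP_NUM is 4 on x86; room_for_pinned_bp() is not a kernel function), roughly the shape of the check __reserve_bp_slot() makes once fetch_bp_busy_slots() and fetch_this_slot() have produced the worst-case counts:

#include <stdio.h>

#define HBP_NUM 4   /* e.g. x86 has 4 debug registers */

/* Pinned breakpoints, plus one slot kept free when any flexible
 * (non-pinned) breakpoint exists, must fit in the register file. */
static int room_for_pinned_bp(unsigned int cpu_pinned,
                              unsigned int max_tsk_pinned,
                              unsigned int flexible)
{
    unsigned int pinned = cpu_pinned + max_tsk_pinned + 1; /* +1: the new bp */

    return pinned + (flexible ? 1 : 0) <= HBP_NUM;
}

int main(void)
{
    /* 2 cpu-wide pinned, a task already using 1, no flexible: 4 <= 4, fits */
    printf("%s\n", room_for_pinned_bp(2, 1, 0) ? "fits" : "no room");
    /* same, but one flexible breakpoint must keep a slot: 5 > 4, rejected  */
    printf("%s\n", room_for_pinned_bp(2, 1, 1) ? "fits" : "no room");
    return 0;
}

The second call is rejected because the flexible breakpoint must keep at least one register for itself, as the comment above puts it.
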
@@ -518,8 +500,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
518 perf_overflow_handler_t triggered, 500 perf_overflow_handler_t triggered,
519 void *context) 501 void *context)
520{ 502{
521 struct perf_event * __percpu *cpu_events, **pevent, *bp; 503 struct perf_event * __percpu *cpu_events, *bp;
522 long err; 504 long err = 0;
523 int cpu; 505 int cpu;
524 506
525 cpu_events = alloc_percpu(typeof(*cpu_events)); 507 cpu_events = alloc_percpu(typeof(*cpu_events));
@@ -528,31 +510,21 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
528 510
529 get_online_cpus(); 511 get_online_cpus();
530 for_each_online_cpu(cpu) { 512 for_each_online_cpu(cpu) {
531 pevent = per_cpu_ptr(cpu_events, cpu);
532 bp = perf_event_create_kernel_counter(attr, cpu, NULL, 513 bp = perf_event_create_kernel_counter(attr, cpu, NULL,
533 triggered, context); 514 triggered, context);
534
535 *pevent = bp;
536
537 if (IS_ERR(bp)) { 515 if (IS_ERR(bp)) {
538 err = PTR_ERR(bp); 516 err = PTR_ERR(bp);
539 goto fail; 517 break;
540 } 518 }
541 }
542 put_online_cpus();
543 519
544 return cpu_events; 520 per_cpu(*cpu_events, cpu) = bp;
545
546fail:
547 for_each_online_cpu(cpu) {
548 pevent = per_cpu_ptr(cpu_events, cpu);
549 if (IS_ERR(*pevent))
550 break;
551 unregister_hw_breakpoint(*pevent);
552 } 521 }
553 put_online_cpus(); 522 put_online_cpus();
554 523
555 free_percpu(cpu_events); 524 if (likely(!err))
525 return cpu_events;
526
527 unregister_wide_hw_breakpoint(cpu_events);
556 return (void __percpu __force *)ERR_PTR(err); 528 return (void __percpu __force *)ERR_PTR(err);
557} 529}
558EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 530EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
@@ -564,12 +536,10 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
564void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) 536void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
565{ 537{
566 int cpu; 538 int cpu;
567 struct perf_event **pevent;
568 539
569 for_each_possible_cpu(cpu) { 540 for_each_possible_cpu(cpu)
570 pevent = per_cpu_ptr(cpu_events, cpu); 541 unregister_hw_breakpoint(per_cpu(*cpu_events, cpu));
571 unregister_hw_breakpoint(*pevent); 542
572 }
573 free_percpu(cpu_events); 543 free_percpu(cpu_events);
574} 544}
575EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); 545EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
@@ -612,6 +582,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags)
612 if (!(flags & PERF_EF_START)) 582 if (!(flags & PERF_EF_START))
613 bp->hw.state = PERF_HES_STOPPED; 583 bp->hw.state = PERF_HES_STOPPED;
614 584
585 if (is_sampling_event(bp)) {
586 bp->hw.last_period = bp->hw.sample_period;
587 perf_swevent_set_period(bp);
588 }
589
615 return arch_install_hw_breakpoint(bp); 590 return arch_install_hw_breakpoint(bp);
616} 591}
617 592
@@ -650,7 +625,6 @@ static struct pmu perf_breakpoint = {
650 625
651int __init init_hw_breakpoint(void) 626int __init init_hw_breakpoint(void)
652{ 627{
653 unsigned int **task_bp_pinned;
654 int cpu, err_cpu; 628 int cpu, err_cpu;
655 int i; 629 int i;
656 630
@@ -659,10 +633,11 @@ int __init init_hw_breakpoint(void)
659 633
660 for_each_possible_cpu(cpu) { 634 for_each_possible_cpu(cpu) {
661 for (i = 0; i < TYPE_MAX; i++) { 635 for (i = 0; i < TYPE_MAX; i++) {
662 task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); 636 struct bp_cpuinfo *info = get_bp_info(cpu, i);
663 *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], 637
664 GFP_KERNEL); 638 info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
665 if (!*task_bp_pinned) 639 GFP_KERNEL);
640 if (!info->tsk_pinned)
666 goto err_alloc; 641 goto err_alloc;
667 } 642 }
668 } 643 }
@@ -676,7 +651,7 @@ int __init init_hw_breakpoint(void)
676 err_alloc: 651 err_alloc:
677 for_each_possible_cpu(err_cpu) { 652 for_each_possible_cpu(err_cpu) {
678 for (i = 0; i < TYPE_MAX; i++) 653 for (i = 0; i < TYPE_MAX; i++)
679 kfree(per_cpu(nr_task_bp_pinned[i], err_cpu)); 654 kfree(get_bp_info(err_cpu, i)->tsk_pinned);
680 if (err_cpu == cpu) 655 if (err_cpu == cpu)
681 break; 656 break;
682 } 657 }
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index eb675c4d59df..ca6599723be5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -31,6 +31,10 @@ struct ring_buffer {
31 spinlock_t event_lock; 31 spinlock_t event_lock;
32 struct list_head event_list; 32 struct list_head event_list;
33 33
34 atomic_t mmap_count;
35 unsigned long mmap_locked;
36 struct user_struct *mmap_user;
37
34 struct perf_event_mmap_page *user_page; 38 struct perf_event_mmap_page *user_page;
35 void *data_pages[0]; 39 void *data_pages[0];
36}; 40};
diff --git a/kernel/exit.c b/kernel/exit.c
index af2eb3cbd499..fafe75d9e6f6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -312,17 +312,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
312 } 312 }
313} 313}
314 314
315void __set_special_pids(struct pid *pid)
316{
317 struct task_struct *curr = current->group_leader;
318
319 if (task_session(curr) != pid)
320 change_pid(curr, PIDTYPE_SID, pid);
321
322 if (task_pgrp(curr) != pid)
323 change_pid(curr, PIDTYPE_PGID, pid);
324}
325
326/* 315/*
327 * Let kernel threads use this to say that they allow a certain signal. 316 * Let kernel threads use this to say that they allow a certain signal.
328 * Must not be used if kthread was cloned with CLONE_SIGHAND. 317 * Must not be used if kthread was cloned with CLONE_SIGHAND.
@@ -649,7 +638,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
649 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 638 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
650 */ 639 */
651 forget_original_parent(tsk); 640 forget_original_parent(tsk);
652 exit_task_namespaces(tsk);
653 641
654 write_lock_irq(&tasklist_lock); 642 write_lock_irq(&tasklist_lock);
655 if (group_dead) 643 if (group_dead)
@@ -795,6 +783,7 @@ void do_exit(long code)
795 exit_shm(tsk); 783 exit_shm(tsk);
796 exit_files(tsk); 784 exit_files(tsk);
797 exit_fs(tsk); 785 exit_fs(tsk);
786 exit_task_namespaces(tsk);
798 exit_task_work(tsk); 787 exit_task_work(tsk);
799 check_stack_usage(); 788 check_stack_usage();
800 exit_thread(); 789 exit_thread();
@@ -835,7 +824,7 @@ void do_exit(long code)
835 /* 824 /*
836 * Make sure we are holding no locks: 825 * Make sure we are holding no locks:
837 */ 826 */
838 debug_check_no_locks_held(tsk); 827 debug_check_no_locks_held();
839 /* 828 /*
840 * We can do this unlocked here. The futex code uses this flag 829 * We can do this unlocked here. The futex code uses this flag
841 * just to verify whether the pi state cleanup has been done 830 * just to verify whether the pi state cleanup has been done
diff --git a/kernel/fork.c b/kernel/fork.c
index 987b28a1f01b..6e6a1c11b3e5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1121,6 +1121,12 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
1121 INIT_LIST_HEAD(&tsk->cpu_timers[2]); 1121 INIT_LIST_HEAD(&tsk->cpu_timers[2]);
1122} 1122}
1123 1123
1124static inline void
1125init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
1126{
1127 task->pids[type].pid = pid;
1128}
1129
1124/* 1130/*
1125 * This creates a new process as a copy of the old one, 1131 * This creates a new process as a copy of the old one,
1126 * but does not actually start it yet. 1132 * but does not actually start it yet.
@@ -1199,8 +1205,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1199 retval = -EAGAIN; 1205 retval = -EAGAIN;
1200 if (atomic_read(&p->real_cred->user->processes) >= 1206 if (atomic_read(&p->real_cred->user->processes) >=
1201 task_rlimit(p, RLIMIT_NPROC)) { 1207 task_rlimit(p, RLIMIT_NPROC)) {
1202 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1208 if (p->real_cred->user != INIT_USER &&
1203 p->real_cred->user != INIT_USER) 1209 !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
1204 goto bad_fork_free; 1210 goto bad_fork_free;
1205 } 1211 }
1206 current->flags &= ~PF_NPROC_EXCEEDED; 1212 current->flags &= ~PF_NPROC_EXCEEDED;
@@ -1354,11 +1360,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1354 goto bad_fork_cleanup_io; 1360 goto bad_fork_cleanup_io;
1355 } 1361 }
1356 1362
1357 p->pid = pid_nr(pid);
1358 p->tgid = p->pid;
1359 if (clone_flags & CLONE_THREAD)
1360 p->tgid = current->tgid;
1361
1362 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1363 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1363 /* 1364 /*
1364 * Clear TID on mm_release()? 1365 * Clear TID on mm_release()?
@@ -1394,12 +1395,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1394 clear_all_latency_tracing(p); 1395 clear_all_latency_tracing(p);
1395 1396
1396 /* ok, now we should be set up.. */ 1397 /* ok, now we should be set up.. */
1397 if (clone_flags & CLONE_THREAD) 1398 p->pid = pid_nr(pid);
1399 if (clone_flags & CLONE_THREAD) {
1398 p->exit_signal = -1; 1400 p->exit_signal = -1;
1399 else if (clone_flags & CLONE_PARENT) 1401 p->group_leader = current->group_leader;
1400 p->exit_signal = current->group_leader->exit_signal; 1402 p->tgid = current->tgid;
1401 else 1403 } else {
1402 p->exit_signal = (clone_flags & CSIGNAL); 1404 if (clone_flags & CLONE_PARENT)
1405 p->exit_signal = current->group_leader->exit_signal;
1406 else
1407 p->exit_signal = (clone_flags & CSIGNAL);
1408 p->group_leader = p;
1409 p->tgid = p->pid;
1410 }
1403 1411
1404 p->pdeath_signal = 0; 1412 p->pdeath_signal = 0;
1405 p->exit_state = 0; 1413 p->exit_state = 0;
@@ -1408,15 +1416,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1408 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); 1416 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1409 p->dirty_paused_when = 0; 1417 p->dirty_paused_when = 0;
1410 1418
1411 /*
1412 * Ok, make it visible to the rest of the system.
1413 * We dont wake it up yet.
1414 */
1415 p->group_leader = p;
1416 INIT_LIST_HEAD(&p->thread_group); 1419 INIT_LIST_HEAD(&p->thread_group);
1417 p->task_works = NULL; 1420 p->task_works = NULL;
1418 1421
1419 /* Need tasklist lock for parent etc handling! */ 1422 /*
1423 * Make it visible to the rest of the system, but dont wake it up yet.
1424 * Need tasklist lock for parent etc handling!
1425 */
1420 write_lock_irq(&tasklist_lock); 1426 write_lock_irq(&tasklist_lock);
1421 1427
1422 /* CLONE_PARENT re-uses the old parent */ 1428 /* CLONE_PARENT re-uses the old parent */
@@ -1446,18 +1452,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1446 goto bad_fork_free_pid; 1452 goto bad_fork_free_pid;
1447 } 1453 }
1448 1454
1449 if (clone_flags & CLONE_THREAD) {
1450 current->signal->nr_threads++;
1451 atomic_inc(&current->signal->live);
1452 atomic_inc(&current->signal->sigcnt);
1453 p->group_leader = current->group_leader;
1454 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1455 }
1456
1457 if (likely(p->pid)) { 1455 if (likely(p->pid)) {
1458 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1456 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1459 1457
1458 init_task_pid(p, PIDTYPE_PID, pid);
1460 if (thread_group_leader(p)) { 1459 if (thread_group_leader(p)) {
1460 init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
1461 init_task_pid(p, PIDTYPE_SID, task_session(current));
1462
1461 if (is_child_reaper(pid)) { 1463 if (is_child_reaper(pid)) {
1462 ns_of_pid(pid)->child_reaper = p; 1464 ns_of_pid(pid)->child_reaper = p;
1463 p->signal->flags |= SIGNAL_UNKILLABLE; 1465 p->signal->flags |= SIGNAL_UNKILLABLE;
@@ -1465,13 +1467,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1465 1467
1466 p->signal->leader_pid = pid; 1468 p->signal->leader_pid = pid;
1467 p->signal->tty = tty_kref_get(current->signal->tty); 1469 p->signal->tty = tty_kref_get(current->signal->tty);
1468 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1469 attach_pid(p, PIDTYPE_SID, task_session(current));
1470 list_add_tail(&p->sibling, &p->real_parent->children); 1470 list_add_tail(&p->sibling, &p->real_parent->children);
1471 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1471 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1472 attach_pid(p, PIDTYPE_PGID);
1473 attach_pid(p, PIDTYPE_SID);
1472 __this_cpu_inc(process_counts); 1474 __this_cpu_inc(process_counts);
1475 } else {
1476 current->signal->nr_threads++;
1477 atomic_inc(&current->signal->live);
1478 atomic_inc(&current->signal->sigcnt);
1479 list_add_tail_rcu(&p->thread_group,
1480 &p->group_leader->thread_group);
1473 } 1481 }
1474 attach_pid(p, PIDTYPE_PID, pid); 1482 attach_pid(p, PIDTYPE_PID);
1475 nr_threads++; 1483 nr_threads++;
1476 } 1484 }
1477 1485
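The two-argument attach_pid() calls above work because the pid now comes from task->pids[type].pid, which init_task_pid() fills in beforehand; a sketch of the reworked helper in kernel/pid.c (simplified, details may differ):

void attach_pid(struct task_struct *task, enum pid_type type)
{
	struct pid_link *link = &task->pids[type];

	/* link->pid was set earlier by init_task_pid() */
	hlist_add_head_rcu(&link->node, &link->pid->tasks[type]);
}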
diff --git a/kernel/freezer.c b/kernel/freezer.c
index c38893b0efba..8b2afc1c9df0 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -110,6 +110,18 @@ bool freeze_task(struct task_struct *p)
110{ 110{
111 unsigned long flags; 111 unsigned long flags;
112 112
113 /*
114 * This check can race with freezer_do_not_count, but worst case that
115 * will result in an extra wakeup being sent to the task. It does not
116 * race with freezer_count(), the barriers in freezer_count() and
117 * freezer_should_skip() ensure that either freezer_count() sees
118 * freezing == true in try_to_freeze() and freezes, or
119 * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task
120 * normally.
121 */
122 if (freezer_should_skip(p))
123 return false;
124
113 spin_lock_irqsave(&freezer_lock, flags); 125 spin_lock_irqsave(&freezer_lock, flags);
114 if (!freezing(p) || frozen(p)) { 126 if (!freezing(p) || frozen(p)) {
115 spin_unlock_irqrestore(&freezer_lock, flags); 127 spin_unlock_irqrestore(&freezer_lock, flags);
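The race described in the new comment comes from the PF_FREEZER_SKIP pattern, where a sleeper brackets a blocking call with freezer_do_not_count()/freezer_count(); a rough sketch of such a sleeper (the wrapper name is made up):

#include <linux/freezer.h>
#include <linux/completion.h>

/* Block on a completion while letting the freezer skip us; on return,
 * freezer_count() clears PF_FREEZER_SKIP and calls try_to_freeze(), so a
 * freeze request that raced with the wait is not lost. */
static void wait_event_freezer_friendly(struct completion *done)
{
	freezer_do_not_count();
	wait_for_completion(done);
	freezer_count();
}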
diff --git a/kernel/futex.c b/kernel/futex.c
index b26dcfc02c94..c3a1a55a5214 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -61,6 +61,8 @@
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
62#include <linux/ptrace.h> 62#include <linux/ptrace.h>
63#include <linux/sched/rt.h> 63#include <linux/sched/rt.h>
64#include <linux/hugetlb.h>
65#include <linux/freezer.h>
64 66
65#include <asm/futex.h> 67#include <asm/futex.h>
66 68
@@ -365,7 +367,7 @@ again:
365 } else { 367 } else {
366 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 368 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
367 key->shared.inode = page_head->mapping->host; 369 key->shared.inode = page_head->mapping->host;
368 key->shared.pgoff = page_head->index; 370 key->shared.pgoff = basepage_index(page);
369 } 371 }
370 372
371 get_futex_key_refs(key); 373 get_futex_key_refs(key);
@@ -1807,7 +1809,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1807 * is no timeout, or if it has yet to expire. 1809 * is no timeout, or if it has yet to expire.
1808 */ 1810 */
1809 if (!timeout || timeout->task) 1811 if (!timeout || timeout->task)
1810 schedule(); 1812 freezable_schedule();
1811 } 1813 }
1812 __set_current_state(TASK_RUNNING); 1814 __set_current_state(TASK_RUNNING);
1813} 1815}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index fd4b13b131f8..3ee4d06c6fc2 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -47,6 +47,7 @@
47#include <linux/sched/sysctl.h> 47#include <linux/sched/sysctl.h>
48#include <linux/sched/rt.h> 48#include <linux/sched/rt.h>
49#include <linux/timer.h> 49#include <linux/timer.h>
50#include <linux/freezer.h>
50 51
51#include <asm/uaccess.h> 52#include <asm/uaccess.h>
52 53
@@ -1545,7 +1546,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1545 t->task = NULL; 1546 t->task = NULL;
1546 1547
1547 if (likely(t->task)) 1548 if (likely(t->task))
1548 schedule(); 1549 freezable_schedule();
1549 1550
1550 hrtimer_cancel(&t->timer); 1551 hrtimer_cancel(&t->timer);
1551 mode = HRTIMER_MODE_ABS; 1552 mode = HRTIMER_MODE_ABS;
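Both this hunk and the futex one above swap schedule() for freezable_schedule(), so tasks parked in nanosleep() or futex_wait() stop blocking suspend. At this point the helper in <linux/freezer.h> is essentially the skip/count pattern wrapped around schedule(); a sketch (assumed, not quoted from the header):

/* Equivalent of freezable_schedule(), spelled out: the sleep is bracketed
 * so the freezer treats the task as already frozen while it sleeps, and
 * try_to_freeze() runs once it wakes. */
static inline void freezable_schedule_sketch(void)
{
	freezer_do_not_count();
	schedule();
	freezer_count();
}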
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index cbd97ce0b000..a3bb14fbe5c6 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -213,6 +213,19 @@ void irq_enable(struct irq_desc *desc)
213 irq_state_clr_masked(desc); 213 irq_state_clr_masked(desc);
214} 214}
215 215
216/**
217 * irq_disable - Mark interrupt disabled
218 * @desc: irq descriptor which should be disabled
219 *
220 * If the chip does not implement the irq_disable callback, we
221 * use a lazy disable approach. That means we mark the interrupt
222 * disabled, but leave the hardware unmasked. That's an
223 * optimization because we avoid the hardware access for the
224 * common case where no interrupt happens after we marked it
225 * disabled. If an interrupt happens, then the interrupt flow
226 * handler masks the line at the hardware level and marks it
227 * pending.
228 */
216void irq_disable(struct irq_desc *desc) 229void irq_disable(struct irq_desc *desc)
217{ 230{
218 irq_state_set_disabled(desc); 231 irq_state_set_disabled(desc);
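Concretely, the hardware masking that the comment defers to happens in the flow handlers, not in irq_disable() itself; a simplified, illustrative sketch of that path (not the exact code in handle_edge_irq()):

static void lazy_disable_path_sketch(struct irq_desc *desc)
{
	if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
		/* The line was only marked disabled; mask it now and
		 * remember the event so it can be replayed on irq_enable(). */
		desc->istate |= IRQS_PENDING;
		mask_irq(desc);
		return;
	}
	handle_irq_event(desc);
}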
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c89295a8f668..10e663ab1f4a 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -7,6 +7,7 @@
7#include <linux/irq.h> 7#include <linux/irq.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/export.h> 9#include <linux/export.h>
10#include <linux/irqdomain.h>
10#include <linux/interrupt.h> 11#include <linux/interrupt.h>
11#include <linux/kernel_stat.h> 12#include <linux/kernel_stat.h>
12#include <linux/syscore_ops.h> 13#include <linux/syscore_ops.h>
@@ -16,11 +17,6 @@
16static LIST_HEAD(gc_list); 17static LIST_HEAD(gc_list);
17static DEFINE_RAW_SPINLOCK(gc_lock); 18static DEFINE_RAW_SPINLOCK(gc_lock);
18 19
19static inline struct irq_chip_regs *cur_regs(struct irq_data *d)
20{
21 return &container_of(d->chip, struct irq_chip_type, chip)->regs;
22}
23
24/** 20/**
25 * irq_gc_noop - NOOP function 21 * irq_gc_noop - NOOP function
26 * @d: irq_data 22 * @d: irq_data
@@ -39,16 +35,17 @@ void irq_gc_noop(struct irq_data *d)
39void irq_gc_mask_disable_reg(struct irq_data *d) 35void irq_gc_mask_disable_reg(struct irq_data *d)
40{ 36{
41 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 37 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
42 u32 mask = 1 << (d->irq - gc->irq_base); 38 struct irq_chip_type *ct = irq_data_get_chip_type(d);
39 u32 mask = d->mask;
43 40
44 irq_gc_lock(gc); 41 irq_gc_lock(gc);
45 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); 42 irq_reg_writel(mask, gc->reg_base + ct->regs.disable);
46 gc->mask_cache &= ~mask; 43 *ct->mask_cache &= ~mask;
47 irq_gc_unlock(gc); 44 irq_gc_unlock(gc);
48} 45}
49 46
50/** 47/**
51 * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register 48 * irq_gc_mask_set_bit - Mask chip via setting bit in mask register
52 * @d: irq_data 49 * @d: irq_data
53 * 50 *
54 * Chip has a single mask register. Values of this register are cached 51 * Chip has a single mask register. Values of this register are cached
@@ -57,16 +54,18 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
57void irq_gc_mask_set_bit(struct irq_data *d) 54void irq_gc_mask_set_bit(struct irq_data *d)
58{ 55{
59 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 56 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
60 u32 mask = 1 << (d->irq - gc->irq_base); 57 struct irq_chip_type *ct = irq_data_get_chip_type(d);
58 u32 mask = d->mask;
61 59
62 irq_gc_lock(gc); 60 irq_gc_lock(gc);
63 gc->mask_cache |= mask; 61 *ct->mask_cache |= mask;
64 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); 62 irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
65 irq_gc_unlock(gc); 63 irq_gc_unlock(gc);
66} 64}
65EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
67 66
68/** 67/**
69 * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register 68 * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register
70 * @d: irq_data 69 * @d: irq_data
71 * 70 *
72 * Chip has a single mask register. Values of this register are cached 71 * Chip has a single mask register. Values of this register are cached
@@ -75,13 +74,15 @@ void irq_gc_mask_set_bit(struct irq_data *d)
75void irq_gc_mask_clr_bit(struct irq_data *d) 74void irq_gc_mask_clr_bit(struct irq_data *d)
76{ 75{
77 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 76 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
78 u32 mask = 1 << (d->irq - gc->irq_base); 77 struct irq_chip_type *ct = irq_data_get_chip_type(d);
78 u32 mask = d->mask;
79 79
80 irq_gc_lock(gc); 80 irq_gc_lock(gc);
81 gc->mask_cache &= ~mask; 81 *ct->mask_cache &= ~mask;
82 irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); 82 irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
83 irq_gc_unlock(gc); 83 irq_gc_unlock(gc);
84} 84}
85EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
85 86
86/** 87/**
87 * irq_gc_unmask_enable_reg - Unmask chip via enable register 88 * irq_gc_unmask_enable_reg - Unmask chip via enable register
@@ -93,11 +94,12 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
93void irq_gc_unmask_enable_reg(struct irq_data *d) 94void irq_gc_unmask_enable_reg(struct irq_data *d)
94{ 95{
95 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 96 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
96 u32 mask = 1 << (d->irq - gc->irq_base); 97 struct irq_chip_type *ct = irq_data_get_chip_type(d);
98 u32 mask = d->mask;
97 99
98 irq_gc_lock(gc); 100 irq_gc_lock(gc);
99 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); 101 irq_reg_writel(mask, gc->reg_base + ct->regs.enable);
100 gc->mask_cache |= mask; 102 *ct->mask_cache |= mask;
101 irq_gc_unlock(gc); 103 irq_gc_unlock(gc);
102} 104}
103 105
@@ -108,12 +110,14 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
108void irq_gc_ack_set_bit(struct irq_data *d) 110void irq_gc_ack_set_bit(struct irq_data *d)
109{ 111{
110 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 112 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
111 u32 mask = 1 << (d->irq - gc->irq_base); 113 struct irq_chip_type *ct = irq_data_get_chip_type(d);
114 u32 mask = d->mask;
112 115
113 irq_gc_lock(gc); 116 irq_gc_lock(gc);
114 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); 117 irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
115 irq_gc_unlock(gc); 118 irq_gc_unlock(gc);
116} 119}
120EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
117 121
118/** 122/**
119 * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit 123 * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit
@@ -122,25 +126,27 @@ void irq_gc_ack_set_bit(struct irq_data *d)
122void irq_gc_ack_clr_bit(struct irq_data *d) 126void irq_gc_ack_clr_bit(struct irq_data *d)
123{ 127{
124 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 128 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
125 u32 mask = ~(1 << (d->irq - gc->irq_base)); 129 struct irq_chip_type *ct = irq_data_get_chip_type(d);
130 u32 mask = ~d->mask;
126 131
127 irq_gc_lock(gc); 132 irq_gc_lock(gc);
128 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); 133 irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
129 irq_gc_unlock(gc); 134 irq_gc_unlock(gc);
130} 135}
131 136
132/** 137/**
133 * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt 138 * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt
134 * @d: irq_data 139 * @d: irq_data
135 */ 140 */
136void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) 141void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
137{ 142{
138 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 143 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
139 u32 mask = 1 << (d->irq - gc->irq_base); 144 struct irq_chip_type *ct = irq_data_get_chip_type(d);
145 u32 mask = d->mask;
140 146
141 irq_gc_lock(gc); 147 irq_gc_lock(gc);
142 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); 148 irq_reg_writel(mask, gc->reg_base + ct->regs.mask);
143 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); 149 irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
144 irq_gc_unlock(gc); 150 irq_gc_unlock(gc);
145} 151}
146 152
@@ -151,16 +157,18 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
151void irq_gc_eoi(struct irq_data *d) 157void irq_gc_eoi(struct irq_data *d)
152{ 158{
153 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 159 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
154 u32 mask = 1 << (d->irq - gc->irq_base); 160 struct irq_chip_type *ct = irq_data_get_chip_type(d);
161 u32 mask = d->mask;
155 162
156 irq_gc_lock(gc); 163 irq_gc_lock(gc);
157 irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); 164 irq_reg_writel(mask, gc->reg_base + ct->regs.eoi);
158 irq_gc_unlock(gc); 165 irq_gc_unlock(gc);
159} 166}
160 167
161/** 168/**
162 * irq_gc_set_wake - Set/clr wake bit for an interrupt 169 * irq_gc_set_wake - Set/clr wake bit for an interrupt
163 * @d: irq_data 170 * @d: irq_data
171 * @on: Indicates whether the wake bit should be set or cleared
164 * 172 *
165 * For chips where the wake from suspend functionality is not 173 * For chips where the wake from suspend functionality is not
166 * configured in a separate register and the wakeup active state is 174 * configured in a separate register and the wakeup active state is
@@ -169,7 +177,7 @@ void irq_gc_eoi(struct irq_data *d)
169int irq_gc_set_wake(struct irq_data *d, unsigned int on) 177int irq_gc_set_wake(struct irq_data *d, unsigned int on)
170{ 178{
171 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); 179 struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
172 u32 mask = 1 << (d->irq - gc->irq_base); 180 u32 mask = d->mask;
173 181
174 if (!(mask & gc->wake_enabled)) 182 if (!(mask & gc->wake_enabled))
175 return -EINVAL; 183 return -EINVAL;
@@ -183,6 +191,19 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
183 return 0; 191 return 0;
184} 192}
185 193
194static void
195irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
196 int num_ct, unsigned int irq_base,
197 void __iomem *reg_base, irq_flow_handler_t handler)
198{
199 raw_spin_lock_init(&gc->lock);
200 gc->num_ct = num_ct;
201 gc->irq_base = irq_base;
202 gc->reg_base = reg_base;
203 gc->chip_types->chip.name = name;
204 gc->chip_types->handler = handler;
205}
206
186/** 207/**
187 * irq_alloc_generic_chip - Allocate a generic chip and initialize it 208 * irq_alloc_generic_chip - Allocate a generic chip and initialize it
188 * @name: Name of the irq chip 209 * @name: Name of the irq chip
@@ -203,23 +224,183 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
203 224
204 gc = kzalloc(sz, GFP_KERNEL); 225 gc = kzalloc(sz, GFP_KERNEL);
205 if (gc) { 226 if (gc) {
206 raw_spin_lock_init(&gc->lock); 227 irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base,
207 gc->num_ct = num_ct; 228 handler);
208 gc->irq_base = irq_base;
209 gc->reg_base = reg_base;
210 gc->chip_types->chip.name = name;
211 gc->chip_types->handler = handler;
212 } 229 }
213 return gc; 230 return gc;
214} 231}
215EXPORT_SYMBOL_GPL(irq_alloc_generic_chip); 232EXPORT_SYMBOL_GPL(irq_alloc_generic_chip);
216 233
234static void
235irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
236{
237 struct irq_chip_type *ct = gc->chip_types;
238 u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask;
239 int i;
240
241 for (i = 0; i < gc->num_ct; i++) {
242 if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) {
243 mskptr = &ct[i].mask_cache_priv;
244 mskreg = ct[i].regs.mask;
245 }
246 ct[i].mask_cache = mskptr;
247 if (flags & IRQ_GC_INIT_MASK_CACHE)
248 *mskptr = irq_reg_readl(gc->reg_base + mskreg);
249 }
250}
251
252/**
253 * irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain
254 * @d: irq domain for which to allocate chips
255 * @irqs_per_chip: Number of interrupts each chip handles
256 * @num_ct: Number of irq_chip_type instances associated with each chip
257 * @name: Name of the irq chip
258 * @handler: Default flow handler associated with these chips
259 * @clr: IRQ_* bits to clear in the mapping function
260 * @set: IRQ_* bits to set in the mapping function
261 * @gcflags: Generic chip specific setup flags
262 */
263int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
264 int num_ct, const char *name,
265 irq_flow_handler_t handler,
266 unsigned int clr, unsigned int set,
267 enum irq_gc_flags gcflags)
268{
269 struct irq_domain_chip_generic *dgc;
270 struct irq_chip_generic *gc;
271 int numchips, sz, i;
272 unsigned long flags;
273 void *tmp;
274
275 if (d->gc)
276 return -EBUSY;
277
278 numchips = d->revmap_size / irqs_per_chip;
279 if (!numchips)
280 return -EINVAL;
281
282 /* Allocate a pointer, generic chip and chiptypes for each chip */
283 sz = sizeof(*dgc) + numchips * sizeof(gc);
284 sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type));
285
286 tmp = dgc = kzalloc(sz, GFP_KERNEL);
287 if (!dgc)
288 return -ENOMEM;
289 dgc->irqs_per_chip = irqs_per_chip;
290 dgc->num_chips = numchips;
291 dgc->irq_flags_to_set = set;
292 dgc->irq_flags_to_clear = clr;
293 dgc->gc_flags = gcflags;
294 d->gc = dgc;
295
296 /* Calc pointer to the first generic chip */
297 tmp += sizeof(*dgc) + numchips * sizeof(gc);
298 for (i = 0; i < numchips; i++) {
299 /* Store the pointer to the generic chip */
300 dgc->gc[i] = gc = tmp;
301 irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
302 NULL, handler);
303 gc->domain = d;
304 raw_spin_lock_irqsave(&gc_lock, flags);
305 list_add_tail(&gc->list, &gc_list);
306 raw_spin_unlock_irqrestore(&gc_lock, flags);
307 /* Calc pointer to the next generic chip */
308 tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
309 }
310 d->name = name;
311 return 0;
312}
313EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
314
315/**
316 * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq
317 * @d: irq domain pointer
318 * @hw_irq: Hardware interrupt number
319 */
320struct irq_chip_generic *
321irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
322{
323 struct irq_domain_chip_generic *dgc = d->gc;
324 int idx;
325
326 if (!dgc)
327 return NULL;
328 idx = hw_irq / dgc->irqs_per_chip;
329 if (idx >= dgc->num_chips)
330 return NULL;
331 return dgc->gc[idx];
332}
333EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
334
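For orientation, the intended consumer of irq_alloc_domain_generic_chips() and irq_get_domain_generic_chip() is an irqchip driver that owns a linear domain; a minimal sketch, assuming the usual irq_domain_add_linear() wrapper is available and using made-up register offsets and names:

#include <linux/io.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

/* One linear domain, one generic chip covering all 32 hwirqs, one
 * irq_chip_type with a level flow handler. */
static int example_irq_init(struct device_node *np, void __iomem *reg_base)
{
	struct irq_domain *domain;
	struct irq_chip_generic *gc;
	int ret;

	domain = irq_domain_add_linear(np, 32, &irq_generic_chip_ops, NULL);
	if (!domain)
		return -ENOMEM;

	ret = irq_alloc_domain_generic_chips(domain, 32, 1, "EXAMPLE",
					     handle_level_irq, 0, 0,
					     IRQ_GC_INIT_MASK_CACHE);
	if (ret)
		return ret;

	gc = irq_get_domain_generic_chip(domain, 0);
	gc->reg_base = reg_base;
	gc->chip_types[0].regs.mask = 0x04;	/* hypothetical offsets */
	gc->chip_types[0].regs.ack = 0x08;
	gc->chip_types[0].chip.irq_mask = irq_gc_mask_set_bit;
	gc->chip_types[0].chip.irq_unmask = irq_gc_mask_clr_bit;
	gc->chip_types[0].chip.irq_ack = irq_gc_ack_set_bit;

	return 0;
}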
217/* 335/*
218 * Separate lockdep class for interrupt chip which can nest irq_desc 336 * Separate lockdep class for interrupt chip which can nest irq_desc
219 * lock. 337 * lock.
220 */ 338 */
221static struct lock_class_key irq_nested_lock_class; 339static struct lock_class_key irq_nested_lock_class;
222 340
341/*
342 * irq_map_generic_chip - Map a generic chip for an irq domain
343 */
344static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
345 irq_hw_number_t hw_irq)
346{
347 struct irq_data *data = irq_get_irq_data(virq);
348 struct irq_domain_chip_generic *dgc = d->gc;
349 struct irq_chip_generic *gc;
350 struct irq_chip_type *ct;
351 struct irq_chip *chip;
352 unsigned long flags;
353 int idx;
354
355 if (!d->gc)
356 return -ENODEV;
357
358 idx = hw_irq / dgc->irqs_per_chip;
359 if (idx >= dgc->num_chips)
360 return -EINVAL;
361 gc = dgc->gc[idx];
362
363 idx = hw_irq % dgc->irqs_per_chip;
364
365 if (test_bit(idx, &gc->unused))
366 return -ENOTSUPP;
367
368 if (test_bit(idx, &gc->installed))
369 return -EBUSY;
370
371 ct = gc->chip_types;
372 chip = &ct->chip;
373
374 /* We only init the cache for the first mapping of a generic chip */
375 if (!gc->installed) {
376 raw_spin_lock_irqsave(&gc->lock, flags);
377 irq_gc_init_mask_cache(gc, dgc->gc_flags);
378 raw_spin_unlock_irqrestore(&gc->lock, flags);
379 }
380
381 /* Mark the interrupt as installed */
382 set_bit(idx, &gc->installed);
383
384 if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
385 irq_set_lockdep_class(virq, &irq_nested_lock_class);
386
387 if (chip->irq_calc_mask)
388 chip->irq_calc_mask(data);
389 else
390 data->mask = 1 << idx;
391
392 irq_set_chip_and_handler(virq, chip, ct->handler);
393 irq_set_chip_data(virq, gc);
394 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
395 return 0;
396}
397
398struct irq_domain_ops irq_generic_chip_ops = {
399 .map = irq_map_generic_chip,
400 .xlate = irq_domain_xlate_onetwocell,
401};
402EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
403
223/** 404/**
224 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip 405 * irq_setup_generic_chip - Setup a range of interrupts with a generic chip
225 * @gc: Generic irq chip holding all data 406 * @gc: Generic irq chip holding all data
@@ -237,15 +418,14 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
237 unsigned int set) 418 unsigned int set)
238{ 419{
239 struct irq_chip_type *ct = gc->chip_types; 420 struct irq_chip_type *ct = gc->chip_types;
421 struct irq_chip *chip = &ct->chip;
240 unsigned int i; 422 unsigned int i;
241 423
242 raw_spin_lock(&gc_lock); 424 raw_spin_lock(&gc_lock);
243 list_add_tail(&gc->list, &gc_list); 425 list_add_tail(&gc->list, &gc_list);
244 raw_spin_unlock(&gc_lock); 426 raw_spin_unlock(&gc_lock);
245 427
246 /* Init mask cache ? */ 428 irq_gc_init_mask_cache(gc, flags);
247 if (flags & IRQ_GC_INIT_MASK_CACHE)
248 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
249 429
250 for (i = gc->irq_base; msk; msk >>= 1, i++) { 430 for (i = gc->irq_base; msk; msk >>= 1, i++) {
251 if (!(msk & 0x01)) 431 if (!(msk & 0x01))
@@ -254,7 +434,15 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
254 if (flags & IRQ_GC_INIT_NESTED_LOCK) 434 if (flags & IRQ_GC_INIT_NESTED_LOCK)
255 irq_set_lockdep_class(i, &irq_nested_lock_class); 435 irq_set_lockdep_class(i, &irq_nested_lock_class);
256 436
257 irq_set_chip_and_handler(i, &ct->chip, ct->handler); 437 if (!(flags & IRQ_GC_NO_MASK)) {
438 struct irq_data *d = irq_get_irq_data(i);
439
440 if (chip->irq_calc_mask)
441 chip->irq_calc_mask(d);
442 else
443 d->mask = 1 << (i - gc->irq_base);
444 }
445 irq_set_chip_and_handler(i, chip, ct->handler);
258 irq_set_chip_data(i, gc); 446 irq_set_chip_data(i, gc);
259 irq_modify_status(i, clr, set); 447 irq_modify_status(i, clr, set);
260 } 448 }
@@ -265,7 +453,7 @@ EXPORT_SYMBOL_GPL(irq_setup_generic_chip);
265/** 453/**
266 * irq_setup_alt_chip - Switch to alternative chip 454 * irq_setup_alt_chip - Switch to alternative chip
267 * @d: irq_data for this interrupt 455 * @d: irq_data for this interrupt
268 * @type Flow type to be initialized 456 * @type: Flow type to be initialized
269 * 457 *
270 * Only to be called from chip->irq_set_type() callbacks. 458 * Only to be called from chip->irq_set_type() callbacks.
271 */ 459 */
@@ -317,6 +505,24 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
317} 505}
318EXPORT_SYMBOL_GPL(irq_remove_generic_chip); 506EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
319 507
508static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc)
509{
510 unsigned int virq;
511
512 if (!gc->domain)
513 return irq_get_irq_data(gc->irq_base);
514
515 /*
516 * We don't know which of the irqs has been actually
517 * installed. Use the first one.
518 */
519 if (!gc->installed)
520 return NULL;
521
522 virq = irq_find_mapping(gc->domain, gc->irq_base + __ffs(gc->installed));
523 return virq ? irq_get_irq_data(virq) : NULL;
524}
525
320#ifdef CONFIG_PM 526#ifdef CONFIG_PM
321static int irq_gc_suspend(void) 527static int irq_gc_suspend(void)
322{ 528{
@@ -325,8 +531,12 @@ static int irq_gc_suspend(void)
325 list_for_each_entry(gc, &gc_list, list) { 531 list_for_each_entry(gc, &gc_list, list) {
326 struct irq_chip_type *ct = gc->chip_types; 532 struct irq_chip_type *ct = gc->chip_types;
327 533
328 if (ct->chip.irq_suspend) 534 if (ct->chip.irq_suspend) {
329 ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); 535 struct irq_data *data = irq_gc_get_irq_data(gc);
536
537 if (data)
538 ct->chip.irq_suspend(data);
539 }
330 } 540 }
331 return 0; 541 return 0;
332} 542}
@@ -338,8 +548,12 @@ static void irq_gc_resume(void)
338 list_for_each_entry(gc, &gc_list, list) { 548 list_for_each_entry(gc, &gc_list, list) {
339 struct irq_chip_type *ct = gc->chip_types; 549 struct irq_chip_type *ct = gc->chip_types;
340 550
341 if (ct->chip.irq_resume) 551 if (ct->chip.irq_resume) {
342 ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); 552 struct irq_data *data = irq_gc_get_irq_data(gc);
553
554 if (data)
555 ct->chip.irq_resume(data);
556 }
343 } 557 }
344} 558}
345#else 559#else
@@ -354,8 +568,12 @@ static void irq_gc_shutdown(void)
354 list_for_each_entry(gc, &gc_list, list) { 568 list_for_each_entry(gc, &gc_list, list) {
355 struct irq_chip_type *ct = gc->chip_types; 569 struct irq_chip_type *ct = gc->chip_types;
356 570
357 if (ct->chip.irq_pm_shutdown) 571 if (ct->chip.irq_pm_shutdown) {
358 ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); 572 struct irq_data *data = irq_gc_get_irq_data(gc);
573
574 if (data)
575 ct->chip.irq_pm_shutdown(data);
576 }
359 } 577 }
360} 578}
361 579
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 54a4d5223238..2d7cd3428365 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -16,12 +16,6 @@
16#include <linux/smp.h> 16#include <linux/smp.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18 18
19#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs.
20 * ie. legacy 8259, gets irqs 1..15 */
21#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */
22#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */
23#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */
24
25static LIST_HEAD(irq_domain_list); 19static LIST_HEAD(irq_domain_list);
26static DEFINE_MUTEX(irq_domain_mutex); 20static DEFINE_MUTEX(irq_domain_mutex);
27 21
@@ -29,9 +23,11 @@ static DEFINE_MUTEX(revmap_trees_mutex);
29static struct irq_domain *irq_default_domain; 23static struct irq_domain *irq_default_domain;
30 24
31/** 25/**
32 * irq_domain_alloc() - Allocate a new irq_domain data structure 26 * __irq_domain_add() - Allocate a new irq_domain data structure
33 * @of_node: optional device-tree node of the interrupt controller 27 * @of_node: optional device-tree node of the interrupt controller
34 * @revmap_type: type of reverse mapping to use 28 * @size: Size of linear map; 0 for radix mapping only
29 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
30 * direct mapping
35 * @ops: map/unmap domain callbacks 31 * @ops: map/unmap domain callbacks
36 * @host_data: Controller private data pointer 32 * @host_data: Controller private data pointer
37 * 33 *
@@ -39,41 +35,35 @@ static struct irq_domain *irq_default_domain;
39 * register allocated irq_domain with irq_domain_register(). Returns pointer 35 * register allocated irq_domain with irq_domain_register(). Returns pointer
40 * to IRQ domain, or NULL on failure. 36 * to IRQ domain, or NULL on failure.
41 */ 37 */
42static struct irq_domain *irq_domain_alloc(struct device_node *of_node, 38struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
43 unsigned int revmap_type, 39 irq_hw_number_t hwirq_max, int direct_max,
44 const struct irq_domain_ops *ops, 40 const struct irq_domain_ops *ops,
45 void *host_data) 41 void *host_data)
46{ 42{
47 struct irq_domain *domain; 43 struct irq_domain *domain;
48 44
49 domain = kzalloc_node(sizeof(*domain), GFP_KERNEL, 45 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
50 of_node_to_nid(of_node)); 46 GFP_KERNEL, of_node_to_nid(of_node));
51 if (WARN_ON(!domain)) 47 if (WARN_ON(!domain))
52 return NULL; 48 return NULL;
53 49
54 /* Fill structure */ 50 /* Fill structure */
55 domain->revmap_type = revmap_type; 51 INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
56 domain->ops = ops; 52 domain->ops = ops;
57 domain->host_data = host_data; 53 domain->host_data = host_data;
58 domain->of_node = of_node_get(of_node); 54 domain->of_node = of_node_get(of_node);
55 domain->hwirq_max = hwirq_max;
56 domain->revmap_size = size;
57 domain->revmap_direct_max_irq = direct_max;
59 58
60 return domain;
61}
62
63static void irq_domain_free(struct irq_domain *domain)
64{
65 of_node_put(domain->of_node);
66 kfree(domain);
67}
68
69static void irq_domain_add(struct irq_domain *domain)
70{
71 mutex_lock(&irq_domain_mutex); 59 mutex_lock(&irq_domain_mutex);
72 list_add(&domain->link, &irq_domain_list); 60 list_add(&domain->link, &irq_domain_list);
73 mutex_unlock(&irq_domain_mutex); 61 mutex_unlock(&irq_domain_mutex);
74 pr_debug("Allocated domain of type %d @0x%p\n", 62
75 domain->revmap_type, domain); 63 pr_debug("Added domain %s\n", domain->name);
64 return domain;
76} 65}
66EXPORT_SYMBOL_GPL(__irq_domain_add);
77 67
78/** 68/**
79 * irq_domain_remove() - Remove an irq domain. 69 * irq_domain_remove() - Remove an irq domain.
@@ -87,29 +77,12 @@ void irq_domain_remove(struct irq_domain *domain)
87{ 77{
88 mutex_lock(&irq_domain_mutex); 78 mutex_lock(&irq_domain_mutex);
89 79
90 switch (domain->revmap_type) { 80 /*
91 case IRQ_DOMAIN_MAP_LEGACY: 81 * radix_tree_delete() takes care of destroying the root
92 /* 82 * node when all entries are removed. Shout if there are
93 * Legacy domains don't manage their own irq_desc 83 * any mappings left.
94 * allocations, we expect the caller to handle irq_desc 84 */
95 * freeing on their own. 85 WARN_ON(domain->revmap_tree.height);
96 */
97 break;
98 case IRQ_DOMAIN_MAP_TREE:
99 /*
100 * radix_tree_delete() takes care of destroying the root
101 * node when all entries are removed. Shout if there are
102 * any mappings left.
103 */
104 WARN_ON(domain->revmap_data.tree.height);
105 break;
106 case IRQ_DOMAIN_MAP_LINEAR:
107 kfree(domain->revmap_data.linear.revmap);
108 domain->revmap_data.linear.size = 0;
109 break;
110 case IRQ_DOMAIN_MAP_NOMAP:
111 break;
112 }
113 86
114 list_del(&domain->link); 87 list_del(&domain->link);
115 88
@@ -121,44 +94,30 @@ void irq_domain_remove(struct irq_domain *domain)
121 94
122 mutex_unlock(&irq_domain_mutex); 95 mutex_unlock(&irq_domain_mutex);
123 96
124 pr_debug("Removed domain of type %d @0x%p\n", 97 pr_debug("Removed domain %s\n", domain->name);
125 domain->revmap_type, domain);
126 98
127 irq_domain_free(domain); 99 of_node_put(domain->of_node);
100 kfree(domain);
128} 101}
129EXPORT_SYMBOL_GPL(irq_domain_remove); 102EXPORT_SYMBOL_GPL(irq_domain_remove);
130 103
131static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
132 irq_hw_number_t hwirq)
133{
134 irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq;
135 int size = domain->revmap_data.legacy.size;
136
137 if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size))
138 return 0;
139 return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq;
140}
141
142/** 104/**
143 * irq_domain_add_simple() - Allocate and register a simple irq_domain. 105 * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
144 * @of_node: pointer to interrupt controller's device tree node. 106 * @of_node: pointer to interrupt controller's device tree node.
145 * @size: total number of irqs in mapping 107 * @size: total number of irqs in mapping
146 * @first_irq: first number of irq block assigned to the domain, 108 * @first_irq: first number of irq block assigned to the domain,
147 * pass zero to assign irqs on-the-fly. This will result in a 109 * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
148 * linear IRQ domain so it is important to use irq_create_mapping() 110 * pre-map all of the irqs in the domain to virqs starting at first_irq.
149 * for each used IRQ, especially when SPARSE_IRQ is enabled.
150 * @ops: map/unmap domain callbacks 111 * @ops: map/unmap domain callbacks
151 * @host_data: Controller private data pointer 112 * @host_data: Controller private data pointer
152 * 113 *
153 * Allocates a legacy irq_domain if irq_base is positive or a linear 114 * Allocates an irq_domain, and optionally if first_irq is positive then also
154 * domain otherwise. For the legacy domain, IRQ descriptors will also 115 * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq.
155 * be allocated.
156 * 116 *
157 * This is intended to implement the expected behaviour for most 117 * This is intended to implement the expected behaviour for most
158 * interrupt controllers which is that a linear mapping should 118 * interrupt controllers. If device tree is used, then first_irq will be 0 and
159 * normally be used unless the system requires a legacy mapping in 119 * irqs get mapped dynamically on the fly. However, if the controller requires
160 * order to support supplying interrupt numbers during non-DT 120 * static virq assignments (non-DT boot) then it will set that up correctly.
161 * registration of devices.
162 */ 121 */
163struct irq_domain *irq_domain_add_simple(struct device_node *of_node, 122struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
164 unsigned int size, 123 unsigned int size,
@@ -166,33 +125,25 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
166 const struct irq_domain_ops *ops, 125 const struct irq_domain_ops *ops,
167 void *host_data) 126 void *host_data)
168{ 127{
169 if (first_irq > 0) { 128 struct irq_domain *domain;
170 int irq_base; 129
130 domain = __irq_domain_add(of_node, size, size, 0, ops, host_data);
131 if (!domain)
132 return NULL;
171 133
134 if (first_irq > 0) {
172 if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { 135 if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
173 /* 136 /* attempt to allocate irq_descs */
174 * Set the descriptor allocator to search for a 137 int rc = irq_alloc_descs(first_irq, first_irq, size,
175 * 1-to-1 mapping, such as irq_alloc_desc_at(). 138 of_node_to_nid(of_node));
176 * Use of_node_to_nid() which is defined to 139 if (rc < 0)
177 * numa_node_id() on platforms that have no custom
178 * implementation.
179 */
180 irq_base = irq_alloc_descs(first_irq, first_irq, size,
181 of_node_to_nid(of_node));
182 if (irq_base < 0) {
183 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", 140 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
184 first_irq); 141 first_irq);
185 irq_base = first_irq; 142 }
186 } 143 irq_domain_associate_many(domain, first_irq, 0, size);
187 } else
188 irq_base = first_irq;
189
190 return irq_domain_add_legacy(of_node, size, irq_base, 0,
191 ops, host_data);
192 } 144 }
193 145
194 /* A linear domain is the default */ 146 return domain;
195 return irq_domain_add_linear(of_node, size, ops, host_data);
196} 147}
197EXPORT_SYMBOL_GPL(irq_domain_add_simple); 148EXPORT_SYMBOL_GPL(irq_domain_add_simple);
198 149
@@ -219,131 +170,19 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
219 void *host_data) 170 void *host_data)
220{ 171{
221 struct irq_domain *domain; 172 struct irq_domain *domain;
222 unsigned int i;
223 173
224 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); 174 domain = __irq_domain_add(of_node, first_hwirq + size,
175 first_hwirq + size, 0, ops, host_data);
225 if (!domain) 176 if (!domain)
226 return NULL; 177 return NULL;
227 178
228 domain->revmap_data.legacy.first_irq = first_irq; 179 irq_domain_associate_many(domain, first_irq, first_hwirq, size);
229 domain->revmap_data.legacy.first_hwirq = first_hwirq;
230 domain->revmap_data.legacy.size = size;
231
232 mutex_lock(&irq_domain_mutex);
233 /* Verify that all the irqs are available */
234 for (i = 0; i < size; i++) {
235 int irq = first_irq + i;
236 struct irq_data *irq_data = irq_get_irq_data(irq);
237 180
238 if (WARN_ON(!irq_data || irq_data->domain)) {
239 mutex_unlock(&irq_domain_mutex);
240 irq_domain_free(domain);
241 return NULL;
242 }
243 }
244
245 /* Claim all of the irqs before registering a legacy domain */
246 for (i = 0; i < size; i++) {
247 struct irq_data *irq_data = irq_get_irq_data(first_irq + i);
248 irq_data->hwirq = first_hwirq + i;
249 irq_data->domain = domain;
250 }
251 mutex_unlock(&irq_domain_mutex);
252
253 for (i = 0; i < size; i++) {
254 int irq = first_irq + i;
255 int hwirq = first_hwirq + i;
256
257 /* IRQ0 gets ignored */
258 if (!irq)
259 continue;
260
261 /* Legacy flags are left to default at this point,
262 * one can then use irq_create_mapping() to
263 * explicitly change them
264 */
265 if (ops->map)
266 ops->map(domain, irq, hwirq);
267
268 /* Clear norequest flags */
269 irq_clear_status_flags(irq, IRQ_NOREQUEST);
270 }
271
272 irq_domain_add(domain);
273 return domain; 181 return domain;
274} 182}
275EXPORT_SYMBOL_GPL(irq_domain_add_legacy); 183EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
276 184
277/** 185/**
278 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
279 * @of_node: pointer to interrupt controller's device tree node.
280 * @size: Number of interrupts in the domain.
281 * @ops: map/unmap domain callbacks
282 * @host_data: Controller private data pointer
283 */
284struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
285 unsigned int size,
286 const struct irq_domain_ops *ops,
287 void *host_data)
288{
289 struct irq_domain *domain;
290 unsigned int *revmap;
291
292 revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL,
293 of_node_to_nid(of_node));
294 if (WARN_ON(!revmap))
295 return NULL;
296
297 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data);
298 if (!domain) {
299 kfree(revmap);
300 return NULL;
301 }
302 domain->revmap_data.linear.size = size;
303 domain->revmap_data.linear.revmap = revmap;
304 irq_domain_add(domain);
305 return domain;
306}
307EXPORT_SYMBOL_GPL(irq_domain_add_linear);
308
309struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
310 unsigned int max_irq,
311 const struct irq_domain_ops *ops,
312 void *host_data)
313{
314 struct irq_domain *domain = irq_domain_alloc(of_node,
315 IRQ_DOMAIN_MAP_NOMAP, ops, host_data);
316 if (domain) {
317 domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0;
318 irq_domain_add(domain);
319 }
320 return domain;
321}
322EXPORT_SYMBOL_GPL(irq_domain_add_nomap);
323
324/**
325 * irq_domain_add_tree()
326 * @of_node: pointer to interrupt controller's device tree node.
327 * @ops: map/unmap domain callbacks
328 *
329 * Note: The radix tree will be allocated later during boot automatically
330 * (the reverse mapping will use the slow path until that happens).
331 */
332struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
333 const struct irq_domain_ops *ops,
334 void *host_data)
335{
336 struct irq_domain *domain = irq_domain_alloc(of_node,
337 IRQ_DOMAIN_MAP_TREE, ops, host_data);
338 if (domain) {
339 INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL);
340 irq_domain_add(domain);
341 }
342 return domain;
343}
344EXPORT_SYMBOL_GPL(irq_domain_add_tree);
345
346/**
347 * irq_find_host() - Locates a domain for a given device node 186 * irq_find_host() - Locates a domain for a given device node
348 * @node: device-tree node of the interrupt controller 187 * @node: device-tree node of the interrupt controller
349 */ 188 */
@@ -391,125 +230,108 @@ void irq_set_default_host(struct irq_domain *domain)
391} 230}
392EXPORT_SYMBOL_GPL(irq_set_default_host); 231EXPORT_SYMBOL_GPL(irq_set_default_host);
393 232
394static void irq_domain_disassociate_many(struct irq_domain *domain, 233static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
395 unsigned int irq_base, int count)
396{ 234{
397 /* 235 struct irq_data *irq_data = irq_get_irq_data(irq);
398 * disassociate in reverse order; 236 irq_hw_number_t hwirq;
399 * not strictly necessary, but nice for unwinding
400 */
401 while (count--) {
402 int irq = irq_base + count;
403 struct irq_data *irq_data = irq_get_irq_data(irq);
404 irq_hw_number_t hwirq;
405 237
406 if (WARN_ON(!irq_data || irq_data->domain != domain)) 238 if (WARN(!irq_data || irq_data->domain != domain,
407 continue; 239 "virq%i doesn't exist; cannot disassociate\n", irq))
240 return;
408 241
409 hwirq = irq_data->hwirq; 242 hwirq = irq_data->hwirq;
410 irq_set_status_flags(irq, IRQ_NOREQUEST); 243 irq_set_status_flags(irq, IRQ_NOREQUEST);
411 244
412 /* remove chip and handler */ 245 /* remove chip and handler */
413 irq_set_chip_and_handler(irq, NULL, NULL); 246 irq_set_chip_and_handler(irq, NULL, NULL);
414 247
415 /* Make sure it's completed */ 248 /* Make sure it's completed */
416 synchronize_irq(irq); 249 synchronize_irq(irq);
417 250
418 /* Tell the PIC about it */ 251 /* Tell the PIC about it */
419 if (domain->ops->unmap) 252 if (domain->ops->unmap)
420 domain->ops->unmap(domain, irq); 253 domain->ops->unmap(domain, irq);
421 smp_mb(); 254 smp_mb();
422 255
423 irq_data->domain = NULL; 256 irq_data->domain = NULL;
424 irq_data->hwirq = 0; 257 irq_data->hwirq = 0;
425 258
426 /* Clear reverse map */ 259 /* Clear reverse map for this hwirq */
427 switch(domain->revmap_type) { 260 if (hwirq < domain->revmap_size) {
428 case IRQ_DOMAIN_MAP_LINEAR: 261 domain->linear_revmap[hwirq] = 0;
429 if (hwirq < domain->revmap_data.linear.size) 262 } else {
430 domain->revmap_data.linear.revmap[hwirq] = 0; 263 mutex_lock(&revmap_trees_mutex);
431 break; 264 radix_tree_delete(&domain->revmap_tree, hwirq);
432 case IRQ_DOMAIN_MAP_TREE: 265 mutex_unlock(&revmap_trees_mutex);
433 mutex_lock(&revmap_trees_mutex);
434 radix_tree_delete(&domain->revmap_data.tree, hwirq);
435 mutex_unlock(&revmap_trees_mutex);
436 break;
437 }
438 } 266 }
439} 267}
440 268
441int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, 269int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
442 irq_hw_number_t hwirq_base, int count) 270 irq_hw_number_t hwirq)
443{ 271{
444 unsigned int virq = irq_base; 272 struct irq_data *irq_data = irq_get_irq_data(virq);
445 irq_hw_number_t hwirq = hwirq_base; 273 int ret;
446 int i, ret;
447 274
448 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, 275 if (WARN(hwirq >= domain->hwirq_max,
449 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); 276 "error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name))
277 return -EINVAL;
278 if (WARN(!irq_data, "error: virq%i is not allocated", virq))
279 return -EINVAL;
280 if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
281 return -EINVAL;
450 282
451 for (i = 0; i < count; i++) { 283 mutex_lock(&irq_domain_mutex);
452 struct irq_data *irq_data = irq_get_irq_data(virq + i); 284 irq_data->hwirq = hwirq;
453 285 irq_data->domain = domain;
454 if (WARN(!irq_data, "error: irq_desc not allocated; " 286 if (domain->ops->map) {
455 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) 287 ret = domain->ops->map(domain, virq, hwirq);
456 return -EINVAL; 288 if (ret != 0) {
457 if (WARN(irq_data->domain, "error: irq_desc already associated; " 289 /*
458 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) 290 * If map() returns -EPERM, this interrupt is protected
459 return -EINVAL; 291 * by the firmware or some other service and shall not
460 }; 292 * be mapped. Don't bother telling the user about it.
461 293 */
462 for (i = 0; i < count; i++, virq++, hwirq++) { 294 if (ret != -EPERM) {
463 struct irq_data *irq_data = irq_get_irq_data(virq); 295 pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n",
464 296 domain->name, hwirq, virq, ret);
465 irq_data->hwirq = hwirq;
466 irq_data->domain = domain;
467 if (domain->ops->map) {
468 ret = domain->ops->map(domain, virq, hwirq);
469 if (ret != 0) {
470 /*
471 * If map() returns -EPERM, this interrupt is protected
472 * by the firmware or some other service and shall not
473 * be mapped.
474 *
475 * Since on some platforms we blindly try to map everything
476 * we end up with a log full of backtraces.
477 *
478 * So instead, we silently fail on -EPERM, it is the
479 * responsibility of the PIC driver to display a relevant
480 * message if needed.
481 */
482 if (ret != -EPERM) {
483 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
484 virq, hwirq, ret);
485 WARN_ON(1);
486 }
487 irq_data->domain = NULL;
488 irq_data->hwirq = 0;
489 goto err_unmap;
490 } 297 }
298 irq_data->domain = NULL;
299 irq_data->hwirq = 0;
300 mutex_unlock(&irq_domain_mutex);
301 return ret;
491 } 302 }
492 303
493 switch (domain->revmap_type) { 304 /* If not already assigned, give the domain the chip's name */
494 case IRQ_DOMAIN_MAP_LINEAR: 305 if (!domain->name && irq_data->chip)
495 if (hwirq < domain->revmap_data.linear.size) 306 domain->name = irq_data->chip->name;
496 domain->revmap_data.linear.revmap[hwirq] = virq; 307 }
497 break;
498 case IRQ_DOMAIN_MAP_TREE:
499 mutex_lock(&revmap_trees_mutex);
500 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
501 mutex_unlock(&revmap_trees_mutex);
502 break;
503 }
504 308
505 irq_clear_status_flags(virq, IRQ_NOREQUEST); 309 if (hwirq < domain->revmap_size) {
310 domain->linear_revmap[hwirq] = virq;
311 } else {
312 mutex_lock(&revmap_trees_mutex);
313 radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
314 mutex_unlock(&revmap_trees_mutex);
506 } 315 }
316 mutex_unlock(&irq_domain_mutex);
317
318 irq_clear_status_flags(virq, IRQ_NOREQUEST);
507 319
508 return 0; 320 return 0;
321}
322EXPORT_SYMBOL_GPL(irq_domain_associate);
509 323
510 err_unmap: 324void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
511 irq_domain_disassociate_many(domain, irq_base, i); 325 irq_hw_number_t hwirq_base, int count)
512 return -EINVAL; 326{
327 int i;
328
329 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
330 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
331
332 for (i = 0; i < count; i++) {
333 irq_domain_associate(domain, irq_base + i, hwirq_base + i);
334 }
513} 335}
514EXPORT_SYMBOL_GPL(irq_domain_associate_many); 336EXPORT_SYMBOL_GPL(irq_domain_associate_many);
515 337
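With the per-type revmap union gone, every domain carries a linear array for small hwirqs and a radix tree for the rest, and the reverse lookup simply picks by hwirq size. Roughly what irq_find_mapping() reduces to after this series (a sketch; the direct-map shortcut and error handling are omitted):

#include <linux/irqdomain.h>

static unsigned int revmap_lookup_sketch(struct irq_domain *domain,
					 irq_hw_number_t hwirq)
{
	struct irq_data *data;

	if (hwirq < domain->revmap_size)
		return domain->linear_revmap[hwirq];	/* fast linear path */

	rcu_read_lock();
	data = radix_tree_lookup(&domain->revmap_tree, hwirq);
	rcu_read_unlock();
	return data ? data->irq : 0;
}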
@@ -519,7 +341,9 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many);
519 * 341 *
520 * This routine is used for irq controllers which can choose the hardware 342 * This routine is used for irq controllers which can choose the hardware
521 * interrupt numbers they generate. In such a case it's simplest to use 343 * interrupt numbers they generate. In such a case it's simplest to use
522 * the linux irq as the hardware interrupt number. 344 * the linux irq as the hardware interrupt number. It still uses the linear
345 * or radix tree to store the mapping, but the irq controller can optimize
346 * the revmap path by using the hwirq directly.
523 */ 347 */
524unsigned int irq_create_direct_mapping(struct irq_domain *domain) 348unsigned int irq_create_direct_mapping(struct irq_domain *domain)
525{ 349{
@@ -528,17 +352,14 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
528 if (domain == NULL) 352 if (domain == NULL)
529 domain = irq_default_domain; 353 domain = irq_default_domain;
530 354
531 if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP))
532 return 0;
533
534 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); 355 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
535 if (!virq) { 356 if (!virq) {
536 pr_debug("create_direct virq allocation failed\n"); 357 pr_debug("create_direct virq allocation failed\n");
537 return 0; 358 return 0;
538 } 359 }
539 if (virq >= domain->revmap_data.nomap.max_irq) { 360 if (virq >= domain->revmap_direct_max_irq) {
540 pr_err("ERROR: no free irqs available below %i maximum\n", 361 pr_err("ERROR: no free irqs available below %i maximum\n",
541 domain->revmap_data.nomap.max_irq); 362 domain->revmap_direct_max_irq);
542 irq_free_desc(virq); 363 irq_free_desc(virq);
543 return 0; 364 return 0;
544 } 365 }
@@ -575,9 +396,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
575 if (domain == NULL) 396 if (domain == NULL)
576 domain = irq_default_domain; 397 domain = irq_default_domain;
577 if (domain == NULL) { 398 if (domain == NULL) {
578 pr_warning("irq_create_mapping called for" 399 WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
579 " NULL domain, hwirq=%lx\n", hwirq);
580 WARN_ON(1);
581 return 0; 400 return 0;
582 } 401 }
583 pr_debug("-> using domain @%p\n", domain); 402 pr_debug("-> using domain @%p\n", domain);
@@ -589,10 +408,6 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
589 return virq; 408 return virq;
590 } 409 }
591 410
592 /* Get a virtual interrupt number */
593 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
594 return irq_domain_legacy_revmap(domain, hwirq);
595
596 /* Allocate a virtual interrupt number */ 411 /* Allocate a virtual interrupt number */
597 hint = hwirq % nr_irqs; 412 hint = hwirq % nr_irqs;
598 if (hint == 0) 413 if (hint == 0)
@@ -645,12 +460,7 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
645 if (unlikely(ret < 0)) 460 if (unlikely(ret < 0))
646 return ret; 461 return ret;
647 462
648 ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count); 463 irq_domain_associate_many(domain, irq_base, hwirq_base, count);
649 if (unlikely(ret < 0)) {
650 irq_free_descs(irq_base, count);
651 return ret;
652 }
653
654 return 0; 464 return 0;
655} 465}
656EXPORT_SYMBOL_GPL(irq_create_strict_mappings); 466EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
@@ -677,8 +487,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
677 if (intsize > 0) 487 if (intsize > 0)
678 return intspec[0]; 488 return intspec[0];
679#endif 489#endif
680 pr_warning("no irq domain found for %s !\n", 490 pr_warn("no irq domain found for %s !\n",
681 of_node_full_name(controller)); 491 of_node_full_name(controller));
682 return 0; 492 return 0;
683 } 493 }
684 494
@@ -698,7 +508,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
698 508
699 /* Set type if specified and different than the current one */ 509 /* Set type if specified and different than the current one */
700 if (type != IRQ_TYPE_NONE && 510 if (type != IRQ_TYPE_NONE &&
701 type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) 511 type != irq_get_trigger_type(virq))
702 irq_set_irq_type(virq, type); 512 irq_set_irq_type(virq, type);
703 return virq; 513 return virq;
704} 514}
@@ -720,11 +530,7 @@ void irq_dispose_mapping(unsigned int virq)
720 if (WARN_ON(domain == NULL)) 530 if (WARN_ON(domain == NULL))
721 return; 531 return;
722 532
723 /* Never unmap legacy interrupts */ 533 irq_domain_disassociate(domain, virq);
724 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
725 return;
726
727 irq_domain_disassociate_many(domain, virq, 1);
728 irq_free_desc(virq); 534 irq_free_desc(virq);
729} 535}
730EXPORT_SYMBOL_GPL(irq_dispose_mapping); 536EXPORT_SYMBOL_GPL(irq_dispose_mapping);
@@ -745,63 +551,51 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
745 if (domain == NULL) 551 if (domain == NULL)
746 return 0; 552 return 0;
747 553
748 switch (domain->revmap_type) { 554 if (hwirq < domain->revmap_direct_max_irq) {
749 case IRQ_DOMAIN_MAP_LEGACY:
750 return irq_domain_legacy_revmap(domain, hwirq);
751 case IRQ_DOMAIN_MAP_LINEAR:
752 return irq_linear_revmap(domain, hwirq);
753 case IRQ_DOMAIN_MAP_TREE:
754 rcu_read_lock();
755 data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
756 rcu_read_unlock();
757 if (data)
758 return data->irq;
759 break;
760 case IRQ_DOMAIN_MAP_NOMAP:
761 data = irq_get_irq_data(hwirq); 555 data = irq_get_irq_data(hwirq);
762 if (data && (data->domain == domain) && (data->hwirq == hwirq)) 556 if (data && (data->domain == domain) && (data->hwirq == hwirq))
763 return hwirq; 557 return hwirq;
764 break;
765 } 558 }
766 559
767 return 0; 560 /* Check if the hwirq is in the linear revmap. */
768} 561 if (hwirq < domain->revmap_size)
769EXPORT_SYMBOL_GPL(irq_find_mapping); 562 return domain->linear_revmap[hwirq];
770 563
771/** 564 rcu_read_lock();
772 * irq_linear_revmap() - Find a linux irq from a hw irq number. 565 data = radix_tree_lookup(&domain->revmap_tree, hwirq);
773 * @domain: domain owning this hardware interrupt 566 rcu_read_unlock();
774 * @hwirq: hardware irq number in that domain space 567 return data ? data->irq : 0;
775 *
776 * This is a fast path that can be called directly by irq controller code to
777 * save a handful of instructions.
778 */
779unsigned int irq_linear_revmap(struct irq_domain *domain,
780 irq_hw_number_t hwirq)
781{
782 BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR);
783
784 /* Check revmap bounds; complain if exceeded */
785 if (WARN_ON(hwirq >= domain->revmap_data.linear.size))
786 return 0;
787
788 return domain->revmap_data.linear.revmap[hwirq];
789} 568}
790EXPORT_SYMBOL_GPL(irq_linear_revmap); 569EXPORT_SYMBOL_GPL(irq_find_mapping);
791 570
792#ifdef CONFIG_IRQ_DOMAIN_DEBUG 571#ifdef CONFIG_IRQ_DOMAIN_DEBUG
793static int virq_debug_show(struct seq_file *m, void *private) 572static int virq_debug_show(struct seq_file *m, void *private)
794{ 573{
795 unsigned long flags; 574 unsigned long flags;
796 struct irq_desc *desc; 575 struct irq_desc *desc;
797 const char *p; 576 struct irq_domain *domain;
798 static const char none[] = "none"; 577 struct radix_tree_iter iter;
799 void *data; 578 void *data, **slot;
800 int i; 579 int i;
801 580
802 seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", 581 seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
582 "name", "mapped", "linear-max", "direct-max", "devtree-node");
583 mutex_lock(&irq_domain_mutex);
584 list_for_each_entry(domain, &irq_domain_list, link) {
585 int count = 0;
586 radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
587 count++;
588 seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
589 domain == irq_default_domain ? '*' : ' ', domain->name,
590 domain->revmap_size + count, domain->revmap_size,
591 domain->revmap_direct_max_irq,
592 domain->of_node ? of_node_full_name(domain->of_node) : "");
593 }
594 mutex_unlock(&irq_domain_mutex);
595
596 seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq",
803 "chip name", (int)(2 * sizeof(void *) + 2), "chip data", 597 "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
804 "domain name"); 598 "active", "type", "domain");
805 599
806 for (i = 1; i < nr_irqs; i++) { 600 for (i = 1; i < nr_irqs; i++) {
807 desc = irq_to_desc(i); 601 desc = irq_to_desc(i);
@@ -809,28 +603,28 @@ static int virq_debug_show(struct seq_file *m, void *private)
809 continue; 603 continue;
810 604
811 raw_spin_lock_irqsave(&desc->lock, flags); 605 raw_spin_lock_irqsave(&desc->lock, flags);
606 domain = desc->irq_data.domain;
812 607
813 if (desc->action && desc->action->handler) { 608 if (domain) {
814 struct irq_chip *chip; 609 struct irq_chip *chip;
610 int hwirq = desc->irq_data.hwirq;
611 bool direct;
815 612
816 seq_printf(m, "%5d ", i); 613 seq_printf(m, "%5d ", i);
817 seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); 614 seq_printf(m, "0x%05x ", hwirq);
818 615
819 chip = irq_desc_get_chip(desc); 616 chip = irq_desc_get_chip(desc);
820 if (chip && chip->name) 617 seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
821 p = chip->name;
822 else
823 p = none;
824 seq_printf(m, "%-15s ", p);
825 618
826 data = irq_desc_get_chip_data(desc); 619 data = irq_desc_get_chip_data(desc);
827 seq_printf(m, data ? "0x%p " : " %p ", data); 620 seq_printf(m, data ? "0x%p " : " %p ", data);
828 621
829 if (desc->irq_data.domain) 622 seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
830 p = of_node_full_name(desc->irq_data.domain->of_node); 623 direct = (i == hwirq) && (i < domain->revmap_direct_max_irq);
831 else 624 seq_printf(m, "%6s%-8s ",
832 p = none; 625 (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
833 seq_printf(m, "%s\n", p); 626 direct ? "(DIRECT)" : "");
627 seq_printf(m, "%s\n", desc->irq_data.domain->name);
834 } 628 }
835 629
836 raw_spin_unlock_irqrestore(&desc->lock, flags); 630 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -927,18 +721,3 @@ const struct irq_domain_ops irq_domain_simple_ops = {
927 .xlate = irq_domain_xlate_onetwocell, 721 .xlate = irq_domain_xlate_onetwocell,
928}; 722};
929EXPORT_SYMBOL_GPL(irq_domain_simple_ops); 723EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
930
931#ifdef CONFIG_OF_IRQ
932void irq_domain_generate_simple(const struct of_device_id *match,
933 u64 phys_base, unsigned int irq_start)
934{
935 struct device_node *node;
936 pr_debug("looking for phys_base=%llx, irq_start=%i\n",
937 (unsigned long long) phys_base, (int) irq_start);
938 node = of_find_matching_node_by_address(NULL, match, phys_base);
939 if (node)
940 irq_domain_add_legacy(node, 32, irq_start, 0,
941 &irq_domain_simple_ops, NULL);
942}
943EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
944#endif
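
The irqdomain hunks above fold the old per-type reverse maps (legacy, linear, tree, nomap) into one irq_find_mapping() that checks the direct range first, then the linear array, then the radix tree. As a consumer-side sketch only (the demo_* names are hypothetical, not from this patch), this is roughly how an interrupt-controller driver ends up using that unified lookup:

#include <linux/bitops.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

#define DEMO_NR_IRQS	32	/* hypothetical controller with 32 inputs */

static struct irq_domain *demo_domain;

/* Called once per hwirq when a mapping is created. */
static int demo_map(struct irq_domain *d, unsigned int virq, irq_hw_number_t hwirq)
{
	irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
	return 0;
}

static const struct irq_domain_ops demo_ops = {
	.map	= demo_map,
	.xlate	= irq_domain_xlate_onecell,
};

static int demo_register(struct device_node *np)
{
	/* hwirqs 0..31 land in the linear revmap; no radix tree needed. */
	demo_domain = irq_domain_add_linear(np, DEMO_NR_IRQS, &demo_ops, NULL);
	return demo_domain ? 0 : -ENOMEM;
}

/* Demux path: translate a pending hwirq bitmask through the unified revmap. */
static void demo_demux(unsigned long pending)
{
	unsigned long hwirq;

	for_each_set_bit(hwirq, &pending, DEMO_NR_IRQS) {
		unsigned int virq = irq_find_mapping(demo_domain, hwirq);

		if (virq)
			generic_handle_irq(virq);
	}
}

Controllers with small, dense hwirq spaces only ever hit the linear array; hwirqs above the declared size fall back to the radix tree transparently.
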
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index fa17855ca65a..514bcfd855a8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -555,9 +555,9 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
555 return 0; 555 return 0;
556 556
557 if (irq_settings_can_request(desc)) { 557 if (irq_settings_can_request(desc)) {
558 if (desc->action) 558 if (!desc->action ||
559 if (irqflags & desc->action->flags & IRQF_SHARED) 559 irqflags & desc->action->flags & IRQF_SHARED)
560 canrequest =1; 560 canrequest = 1;
561 } 561 }
562 irq_put_desc_unlock(desc, flags); 562 irq_put_desc_unlock(desc, flags);
563 return canrequest; 563 return canrequest;
@@ -840,9 +840,6 @@ static void irq_thread_dtor(struct callback_head *unused)
840static int irq_thread(void *data) 840static int irq_thread(void *data)
841{ 841{
842 struct callback_head on_exit_work; 842 struct callback_head on_exit_work;
843 static const struct sched_param param = {
844 .sched_priority = MAX_USER_RT_PRIO/2,
845 };
846 struct irqaction *action = data; 843 struct irqaction *action = data;
847 struct irq_desc *desc = irq_to_desc(action->irq); 844 struct irq_desc *desc = irq_to_desc(action->irq);
848 irqreturn_t (*handler_fn)(struct irq_desc *desc, 845 irqreturn_t (*handler_fn)(struct irq_desc *desc,
@@ -854,8 +851,6 @@ static int irq_thread(void *data)
854 else 851 else
855 handler_fn = irq_thread_fn; 852 handler_fn = irq_thread_fn;
856 853
857 sched_setscheduler(current, SCHED_FIFO, &param);
858
859 init_task_work(&on_exit_work, irq_thread_dtor); 854 init_task_work(&on_exit_work, irq_thread_dtor);
860 task_work_add(current, &on_exit_work, false); 855 task_work_add(current, &on_exit_work, false);
861 856
@@ -950,6 +945,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
950 */ 945 */
951 if (new->thread_fn && !nested) { 946 if (new->thread_fn && !nested) {
952 struct task_struct *t; 947 struct task_struct *t;
948 static const struct sched_param param = {
949 .sched_priority = MAX_USER_RT_PRIO/2,
950 };
953 951
954 t = kthread_create(irq_thread, new, "irq/%d-%s", irq, 952 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
955 new->name); 953 new->name);
@@ -957,6 +955,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
957 ret = PTR_ERR(t); 955 ret = PTR_ERR(t);
958 goto out_mput; 956 goto out_mput;
959 } 957 }
958
959 sched_setscheduler(t, SCHED_FIFO, &param);
960
960 /* 961 /*
961 * We keep the reference to the task struct even if 962 * We keep the reference to the task struct even if
962 * the thread dies to avoid that the interrupt code 963 * the thread dies to avoid that the interrupt code
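
The manage.c hunks move the SCHED_FIFO setup out of irq_thread() and into __setup_irq(), so the creating context fixes the kthread's scheduling policy before the thread ever runs. A generic sketch of that creator-sets-policy pattern (hypothetical demo_* names, not from the patch):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>	/* MAX_USER_RT_PRIO */

static int demo_thread_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *demo_start_fifo_thread(void)
{
	static const struct sched_param param = {
		.sched_priority = MAX_USER_RT_PRIO / 2,
	};
	struct task_struct *t;

	t = kthread_create(demo_thread_fn, NULL, "demo/fifo");
	if (IS_ERR(t))
		return t;

	/* Set the policy before waking the thread, as __setup_irq() now does. */
	sched_setscheduler(t, SCHED_FIFO, &param);
	wake_up_process(t);
	return t;
}
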
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 19ed5c425c3b..36f6ee181b0c 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -462,6 +462,8 @@ int show_interrupts(struct seq_file *p, void *v)
462 } else { 462 } else {
463 seq_printf(p, " %8s", "None"); 463 seq_printf(p, " %8s", "None");
464 } 464 }
465 if (desc->irq_data.domain)
466 seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
465#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL 467#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
466 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); 468 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
467#endif 469#endif
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8241906c4b61..fb326365b694 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -147,6 +147,9 @@ int __request_module(bool wait, const char *fmt, ...)
147 */ 147 */
148 WARN_ON_ONCE(wait && current_is_async()); 148 WARN_ON_ONCE(wait && current_is_async());
149 149
150 if (!modprobe_path[0])
151 return 0;
152
150 va_start(args, fmt); 153 va_start(args, fmt);
151 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 154 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
152 va_end(args); 155 va_end(args);
@@ -569,14 +572,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
569 int retval = 0; 572 int retval = 0;
570 573
571 helper_lock(); 574 helper_lock();
572 if (!sub_info->path) {
573 retval = -EINVAL;
574 goto out;
575 }
576
577 if (sub_info->path[0] == '\0')
578 goto out;
579
580 if (!khelper_wq || usermodehelper_disabled) { 575 if (!khelper_wq || usermodehelper_disabled) {
581 retval = -EBUSY; 576 retval = -EBUSY;
582 goto out; 577 goto out;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3fed7f0cbcdf..6e33498d665c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -467,6 +467,7 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
467/* Optimization staging list, protected by kprobe_mutex */ 467/* Optimization staging list, protected by kprobe_mutex */
468static LIST_HEAD(optimizing_list); 468static LIST_HEAD(optimizing_list);
469static LIST_HEAD(unoptimizing_list); 469static LIST_HEAD(unoptimizing_list);
470static LIST_HEAD(freeing_list);
470 471
471static void kprobe_optimizer(struct work_struct *work); 472static void kprobe_optimizer(struct work_struct *work);
472static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
@@ -504,7 +505,7 @@ static __kprobes void do_optimize_kprobes(void)
504 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint 505 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
505 * if need) kprobes listed on unoptimizing_list. 506 * if need) kprobes listed on unoptimizing_list.
506 */ 507 */
507static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) 508static __kprobes void do_unoptimize_kprobes(void)
508{ 509{
509 struct optimized_kprobe *op, *tmp; 510 struct optimized_kprobe *op, *tmp;
510 511
@@ -515,9 +516,9 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
515 /* Ditto to do_optimize_kprobes */ 516 /* Ditto to do_optimize_kprobes */
516 get_online_cpus(); 517 get_online_cpus();
517 mutex_lock(&text_mutex); 518 mutex_lock(&text_mutex);
518 arch_unoptimize_kprobes(&unoptimizing_list, free_list); 519 arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
519 /* Loop free_list for disarming */ 520 /* Loop free_list for disarming */
520 list_for_each_entry_safe(op, tmp, free_list, list) { 521 list_for_each_entry_safe(op, tmp, &freeing_list, list) {
521 /* Disarm probes if marked disabled */ 522 /* Disarm probes if marked disabled */
522 if (kprobe_disabled(&op->kp)) 523 if (kprobe_disabled(&op->kp))
523 arch_disarm_kprobe(&op->kp); 524 arch_disarm_kprobe(&op->kp);
@@ -536,11 +537,11 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
536} 537}
537 538
538/* Reclaim all kprobes on the free_list */ 539/* Reclaim all kprobes on the free_list */
539static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) 540static __kprobes void do_free_cleaned_kprobes(void)
540{ 541{
541 struct optimized_kprobe *op, *tmp; 542 struct optimized_kprobe *op, *tmp;
542 543
543 list_for_each_entry_safe(op, tmp, free_list, list) { 544 list_for_each_entry_safe(op, tmp, &freeing_list, list) {
544 BUG_ON(!kprobe_unused(&op->kp)); 545 BUG_ON(!kprobe_unused(&op->kp));
545 list_del_init(&op->list); 546 list_del_init(&op->list);
546 free_aggr_kprobe(&op->kp); 547 free_aggr_kprobe(&op->kp);
@@ -556,8 +557,6 @@ static __kprobes void kick_kprobe_optimizer(void)
556/* Kprobe jump optimizer */ 557/* Kprobe jump optimizer */
557static __kprobes void kprobe_optimizer(struct work_struct *work) 558static __kprobes void kprobe_optimizer(struct work_struct *work)
558{ 559{
559 LIST_HEAD(free_list);
560
561 mutex_lock(&kprobe_mutex); 560 mutex_lock(&kprobe_mutex);
562 /* Lock modules while optimizing kprobes */ 561 /* Lock modules while optimizing kprobes */
563 mutex_lock(&module_mutex); 562 mutex_lock(&module_mutex);
@@ -566,7 +565,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
566 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) 565 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
567 * kprobes before waiting for quiescence period. 566
568 */ 567 */
569 do_unoptimize_kprobes(&free_list); 568 do_unoptimize_kprobes();
570 569
571 /* 570 /*
572 * Step 2: Wait for quiescence period to ensure all running interrupts 571
@@ -581,7 +580,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
581 do_optimize_kprobes(); 580 do_optimize_kprobes();
582 581
583 /* Step 4: Free cleaned kprobes after quiescence period */ 582
584 do_free_cleaned_kprobes(&free_list); 583 do_free_cleaned_kprobes();
585 584
586 mutex_unlock(&module_mutex); 585 mutex_unlock(&module_mutex);
587 mutex_unlock(&kprobe_mutex); 586 mutex_unlock(&kprobe_mutex);
@@ -723,8 +722,19 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p)
723 if (!list_empty(&op->list)) 722 if (!list_empty(&op->list))
724 /* Dequeue from the (un)optimization queue */ 723 /* Dequeue from the (un)optimization queue */
725 list_del_init(&op->list); 724 list_del_init(&op->list);
726
727 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; 725 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
726
727 if (kprobe_unused(p)) {
728 /* Enqueue if it is unused */
729 list_add(&op->list, &freeing_list);
730 /*
731 * Remove unused probes from the hash list. After waiting
732 * for synchronization, this probe is reclaimed.
733 * (reclaiming is done by do_free_cleaned_kprobes().)
734 */
735 hlist_del_rcu(&op->kp.hlist);
736 }
737
728 /* Don't touch the code, because it is already freed. */ 738 /* Don't touch the code, because it is already freed. */
729 arch_remove_optimized_kprobe(op); 739 arch_remove_optimized_kprobe(op);
730} 740}
@@ -2322,6 +2332,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
2322 if (copy_from_user(buf, user_buf, buf_size)) 2332 if (copy_from_user(buf, user_buf, buf_size))
2323 return -EFAULT; 2333 return -EFAULT;
2324 2334
2335 buf[buf_size] = '\0';
2325 switch (buf[0]) { 2336 switch (buf[0]) {
2326 case 'y': 2337 case 'y':
2327 case 'Y': 2338 case 'Y':
@@ -2333,6 +2344,8 @@ static ssize_t write_enabled_file_bool(struct file *file,
2333 case '0': 2344 case '0':
2334 disarm_all_kprobes(); 2345 disarm_all_kprobes();
2335 break; 2346 break;
2347 default:
2348 return -EINVAL;
2336 } 2349 }
2337 2350
2338 return count; 2351 return count;
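
Besides switching the optimizer to the new global freeing_list, the kprobes diff tightens write_enabled_file_bool(): the copied user buffer is now NUL-terminated and anything other than y/Y/1 or n/N/0 gets -EINVAL. A stand-alone sketch of that debugfs write pattern (hypothetical demo_* names, not from the patch):

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>

static bool demo_enabled;

static ssize_t demo_write_bool(struct file *file, const char __user *user_buf,
			       size_t count, loff_t *ppos)
{
	char buf[32];
	size_t buf_size = min(count, sizeof(buf) - 1);

	if (copy_from_user(buf, user_buf, buf_size))
		return -EFAULT;

	buf[buf_size] = '\0';	/* never parse an unterminated buffer */
	switch (buf[0]) {
	case 'y': case 'Y': case '1':
		demo_enabled = true;
		break;
	case 'n': case 'N': case '0':
		demo_enabled = false;
		break;
	default:
		return -EINVAL;	/* reject garbage instead of silently ignoring it */
	}

	return count;
}

static const struct file_operations demo_fops = {
	.write = demo_write_bool,
};

/* e.g. debugfs_create_file("demo_enabled", 0600, NULL, NULL, &demo_fops); */
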
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1f3186b37fd5..e16c45b9ee77 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -4090,7 +4090,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
4090} 4090}
4091EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); 4091EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
4092 4092
4093static void print_held_locks_bug(struct task_struct *curr) 4093static void print_held_locks_bug(void)
4094{ 4094{
4095 if (!debug_locks_off()) 4095 if (!debug_locks_off())
4096 return; 4096 return;
@@ -4099,22 +4099,21 @@ static void print_held_locks_bug(struct task_struct *curr)
4099 4099
4100 printk("\n"); 4100 printk("\n");
4101 printk("=====================================\n"); 4101 printk("=====================================\n");
4102 printk("[ BUG: lock held at task exit time! ]\n"); 4102 printk("[ BUG: %s/%d still has locks held! ]\n",
4103 current->comm, task_pid_nr(current));
4103 print_kernel_ident(); 4104 print_kernel_ident();
4104 printk("-------------------------------------\n"); 4105 printk("-------------------------------------\n");
4105 printk("%s/%d is exiting with locks still held!\n", 4106 lockdep_print_held_locks(current);
4106 curr->comm, task_pid_nr(curr));
4107 lockdep_print_held_locks(curr);
4108
4109 printk("\nstack backtrace:\n"); 4107 printk("\nstack backtrace:\n");
4110 dump_stack(); 4108 dump_stack();
4111} 4109}
4112 4110
4113void debug_check_no_locks_held(struct task_struct *task) 4111void debug_check_no_locks_held(void)
4114{ 4112{
4115 if (unlikely(task->lockdep_depth > 0)) 4113 if (unlikely(current->lockdep_depth > 0))
4116 print_held_locks_bug(task); 4114 print_held_locks_bug();
4117} 4115}
4116EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
4118 4117
4119void debug_show_all_locks(void) 4118void debug_show_all_locks(void)
4120{ 4119{
diff --git a/kernel/mutex.c b/kernel/mutex.c
index ad53a664f113..e581ada5faf4 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -254,16 +254,165 @@ void __sched mutex_unlock(struct mutex *lock)
254 254
255EXPORT_SYMBOL(mutex_unlock); 255EXPORT_SYMBOL(mutex_unlock);
256 256
257/**
258 * ww_mutex_unlock - release the w/w mutex
259 * @lock: the mutex to be released
260 *
261 * Unlock a mutex that has been locked by this task previously with any of the
262 * ww_mutex_lock* functions (with or without an acquire context). It is
263 * forbidden to release the locks after releasing the acquire context.
264 *
265 * This function must not be used in interrupt context. Unlocking
266 * of an unlocked mutex is not allowed.
267 */
268void __sched ww_mutex_unlock(struct ww_mutex *lock)
269{
270 /*
271 * The unlocking fastpath is the 0->1 transition from 'locked'
272 * into 'unlocked' state:
273 */
274 if (lock->ctx) {
275#ifdef CONFIG_DEBUG_MUTEXES
276 DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
277#endif
278 if (lock->ctx->acquired > 0)
279 lock->ctx->acquired--;
280 lock->ctx = NULL;
281 }
282
283#ifndef CONFIG_DEBUG_MUTEXES
284 /*
285 * When debugging is enabled we must not clear the owner before time,
286 * the slow path will always be taken, and that clears the owner field
287 * after verifying that it was indeed current.
288 */
289 mutex_clear_owner(&lock->base);
290#endif
291 __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath);
292}
293EXPORT_SYMBOL(ww_mutex_unlock);
294
295static inline int __sched
296__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
297{
298 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
299 struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
300
301 if (!hold_ctx)
302 return 0;
303
304 if (unlikely(ctx == hold_ctx))
305 return -EALREADY;
306
307 if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
308 (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
309#ifdef CONFIG_DEBUG_MUTEXES
310 DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
311 ctx->contending_lock = ww;
312#endif
313 return -EDEADLK;
314 }
315
316 return 0;
317}
318
319static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
320 struct ww_acquire_ctx *ww_ctx)
321{
322#ifdef CONFIG_DEBUG_MUTEXES
323 /*
324 * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
325 * but released with a normal mutex_unlock in this call.
326 *
327 * This should never happen, always use ww_mutex_unlock.
328 */
329 DEBUG_LOCKS_WARN_ON(ww->ctx);
330
331 /*
332 * Not quite done after calling ww_acquire_done() ?
333 */
334 DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
335
336 if (ww_ctx->contending_lock) {
337 /*
338 * After -EDEADLK you tried to
339 * acquire a different ww_mutex? Bad!
340 */
341 DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
342
343 /*
344 * You called ww_mutex_lock after receiving -EDEADLK,
345 * but 'forgot' to unlock everything else first?
346 */
347 DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
348 ww_ctx->contending_lock = NULL;
349 }
350
351 /*
352 * Naughty, using a different class will lead to undefined behavior!
353 */
354 DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
355#endif
356 ww_ctx->acquired++;
357}
358
359/*
360 * after acquiring lock with fastpath or when we lost out in contested
361 * slowpath, set ctx and wake up any waiters so they can recheck.
362 *
363 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
364 * as the fastpath and opportunistic spinning are disabled in that case.
365 */
366static __always_inline void
367ww_mutex_set_context_fastpath(struct ww_mutex *lock,
368 struct ww_acquire_ctx *ctx)
369{
370 unsigned long flags;
371 struct mutex_waiter *cur;
372
373 ww_mutex_lock_acquired(lock, ctx);
374
375 lock->ctx = ctx;
376
377 /*
378 * The lock->ctx update should be visible on all cores before
379 * the atomic read is done, otherwise contended waiters might be
380 * missed. The contended waiters will either see ww_ctx == NULL
381 * and keep spinning, or it will acquire wait_lock, add itself
382 * to waiter list and sleep.
383 */
384 smp_mb(); /* ^^^ */
385
386 /*
387 * Check if lock is contended, if not there is nobody to wake up
388 */
389 if (likely(atomic_read(&lock->base.count) == 0))
390 return;
391
392 /*
393 * Uh oh, we raced in fastpath, wake up everyone in this case,
394 * so they can see the new lock->ctx.
395 */
396 spin_lock_mutex(&lock->base.wait_lock, flags);
397 list_for_each_entry(cur, &lock->base.wait_list, list) {
398 debug_mutex_wake_waiter(&lock->base, cur);
399 wake_up_process(cur->task);
400 }
401 spin_unlock_mutex(&lock->base.wait_lock, flags);
402}
403
257/* 404/*
258 * Lock a mutex (possibly interruptible), slowpath: 405 * Lock a mutex (possibly interruptible), slowpath:
259 */ 406 */
260static inline int __sched 407static __always_inline int __sched
261__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 408__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
262 struct lockdep_map *nest_lock, unsigned long ip) 409 struct lockdep_map *nest_lock, unsigned long ip,
410 struct ww_acquire_ctx *ww_ctx)
263{ 411{
264 struct task_struct *task = current; 412 struct task_struct *task = current;
265 struct mutex_waiter waiter; 413 struct mutex_waiter waiter;
266 unsigned long flags; 414 unsigned long flags;
415 int ret;
267 416
268 preempt_disable(); 417 preempt_disable();
269 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); 418 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
@@ -298,6 +447,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
298 struct task_struct *owner; 447 struct task_struct *owner;
299 struct mspin_node node; 448 struct mspin_node node;
300 449
450 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) {
451 struct ww_mutex *ww;
452
453 ww = container_of(lock, struct ww_mutex, base);
454 /*
455 * If ww->ctx is set, its contents are undefined; only
456 * by acquiring wait_lock is there a guarantee that
457 * they are valid when read.
458 *
459 * As such, when deadlock detection needs to be
460 * performed the optimistic spinning cannot be done.
461 */
462 if (ACCESS_ONCE(ww->ctx))
463 break;
464 }
465
301 /* 466 /*
302 * If there's an owner, wait for it to either 467 * If there's an owner, wait for it to either
303 * release the lock or go to sleep. 468 * release the lock or go to sleep.
@@ -312,6 +477,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
312 if ((atomic_read(&lock->count) == 1) && 477 if ((atomic_read(&lock->count) == 1) &&
313 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 478 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
314 lock_acquired(&lock->dep_map, ip); 479 lock_acquired(&lock->dep_map, ip);
480 if (!__builtin_constant_p(ww_ctx == NULL)) {
481 struct ww_mutex *ww;
482 ww = container_of(lock, struct ww_mutex, base);
483
484 ww_mutex_set_context_fastpath(ww, ww_ctx);
485 }
486
315 mutex_set_owner(lock); 487 mutex_set_owner(lock);
316 mspin_unlock(MLOCK(lock), &node); 488 mspin_unlock(MLOCK(lock), &node);
317 preempt_enable(); 489 preempt_enable();
@@ -371,15 +543,16 @@ slowpath:
371 * TASK_UNINTERRUPTIBLE case.) 543 * TASK_UNINTERRUPTIBLE case.)
372 */ 544 */
373 if (unlikely(signal_pending_state(state, task))) { 545 if (unlikely(signal_pending_state(state, task))) {
374 mutex_remove_waiter(lock, &waiter, 546 ret = -EINTR;
375 task_thread_info(task)); 547 goto err;
376 mutex_release(&lock->dep_map, 1, ip); 548 }
377 spin_unlock_mutex(&lock->wait_lock, flags);
378 549
379 debug_mutex_free_waiter(&waiter); 550 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) {
380 preempt_enable(); 551 ret = __mutex_lock_check_stamp(lock, ww_ctx);
381 return -EINTR; 552 if (ret)
553 goto err;
382 } 554 }
555
383 __set_task_state(task, state); 556 __set_task_state(task, state);
384 557
385 /* didn't get the lock, go to sleep: */ 558 /* didn't get the lock, go to sleep: */
@@ -394,6 +567,30 @@ done:
394 mutex_remove_waiter(lock, &waiter, current_thread_info()); 567 mutex_remove_waiter(lock, &waiter, current_thread_info());
395 mutex_set_owner(lock); 568 mutex_set_owner(lock);
396 569
570 if (!__builtin_constant_p(ww_ctx == NULL)) {
571 struct ww_mutex *ww = container_of(lock,
572 struct ww_mutex,
573 base);
574 struct mutex_waiter *cur;
575
576 /*
577 * This branch gets optimized out for the common case,
578 * and is only important for ww_mutex_lock.
579 */
580
581 ww_mutex_lock_acquired(ww, ww_ctx);
582 ww->ctx = ww_ctx;
583
584 /*
585 * Give any possible sleeping processes the chance to wake up,
586 * so they can recheck if they have to back off.
587 */
588 list_for_each_entry(cur, &lock->wait_list, list) {
589 debug_mutex_wake_waiter(lock, cur);
590 wake_up_process(cur->task);
591 }
592 }
593
397 /* set it to 0 if there are no waiters left: */ 594 /* set it to 0 if there are no waiters left: */
398 if (likely(list_empty(&lock->wait_list))) 595 if (likely(list_empty(&lock->wait_list)))
399 atomic_set(&lock->count, 0); 596 atomic_set(&lock->count, 0);
@@ -404,6 +601,14 @@ done:
404 preempt_enable(); 601 preempt_enable();
405 602
406 return 0; 603 return 0;
604
605err:
606 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
607 spin_unlock_mutex(&lock->wait_lock, flags);
608 debug_mutex_free_waiter(&waiter);
609 mutex_release(&lock->dep_map, 1, ip);
610 preempt_enable();
611 return ret;
407} 612}
408 613
409#ifdef CONFIG_DEBUG_LOCK_ALLOC 614#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -411,7 +616,8 @@ void __sched
411mutex_lock_nested(struct mutex *lock, unsigned int subclass) 616mutex_lock_nested(struct mutex *lock, unsigned int subclass)
412{ 617{
413 might_sleep(); 618 might_sleep();
414 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); 619 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
620 subclass, NULL, _RET_IP_, NULL);
415} 621}
416 622
417EXPORT_SYMBOL_GPL(mutex_lock_nested); 623EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -420,7 +626,8 @@ void __sched
420_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) 626_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
421{ 627{
422 might_sleep(); 628 might_sleep();
423 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); 629 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
630 0, nest, _RET_IP_, NULL);
424} 631}
425 632
426EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); 633EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
@@ -429,7 +636,8 @@ int __sched
429mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) 636mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
430{ 637{
431 might_sleep(); 638 might_sleep();
432 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); 639 return __mutex_lock_common(lock, TASK_KILLABLE,
640 subclass, NULL, _RET_IP_, NULL);
433} 641}
434EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); 642EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
435 643
@@ -438,10 +646,68 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
438{ 646{
439 might_sleep(); 647 might_sleep();
440 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 648 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
441 subclass, NULL, _RET_IP_); 649 subclass, NULL, _RET_IP_, NULL);
442} 650}
443 651
444EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 652EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
653
654static inline int
655ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
656{
657#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
658 unsigned tmp;
659
660 if (ctx->deadlock_inject_countdown-- == 0) {
661 tmp = ctx->deadlock_inject_interval;
662 if (tmp > UINT_MAX/4)
663 tmp = UINT_MAX;
664 else
665 tmp = tmp*2 + tmp + tmp/2;
666
667 ctx->deadlock_inject_interval = tmp;
668 ctx->deadlock_inject_countdown = tmp;
669 ctx->contending_lock = lock;
670
671 ww_mutex_unlock(lock);
672
673 return -EDEADLK;
674 }
675#endif
676
677 return 0;
678}
679
680int __sched
681__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
682{
683 int ret;
684
685 might_sleep();
686 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
687 0, &ctx->dep_map, _RET_IP_, ctx);
688 if (!ret && ctx->acquired > 0)
689 return ww_mutex_deadlock_injection(lock, ctx);
690
691 return ret;
692}
693EXPORT_SYMBOL_GPL(__ww_mutex_lock);
694
695int __sched
696__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
697{
698 int ret;
699
700 might_sleep();
701 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
702 0, &ctx->dep_map, _RET_IP_, ctx);
703
704 if (!ret && ctx->acquired > 0)
705 return ww_mutex_deadlock_injection(lock, ctx);
706
707 return ret;
708}
709EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
710
445#endif 711#endif
446 712
447/* 713/*
@@ -494,10 +760,10 @@ __mutex_unlock_slowpath(atomic_t *lock_count)
494 * mutex_lock_interruptible() and mutex_trylock(). 760 * mutex_lock_interruptible() and mutex_trylock().
495 */ 761 */
496static noinline int __sched 762static noinline int __sched
497__mutex_lock_killable_slowpath(atomic_t *lock_count); 763__mutex_lock_killable_slowpath(struct mutex *lock);
498 764
499static noinline int __sched 765static noinline int __sched
500__mutex_lock_interruptible_slowpath(atomic_t *lock_count); 766__mutex_lock_interruptible_slowpath(struct mutex *lock);
501 767
502/** 768/**
503 * mutex_lock_interruptible - acquire the mutex, interruptible 769 * mutex_lock_interruptible - acquire the mutex, interruptible
@@ -515,12 +781,12 @@ int __sched mutex_lock_interruptible(struct mutex *lock)
515 int ret; 781 int ret;
516 782
517 might_sleep(); 783 might_sleep();
518 ret = __mutex_fastpath_lock_retval 784 ret = __mutex_fastpath_lock_retval(&lock->count);
519 (&lock->count, __mutex_lock_interruptible_slowpath); 785 if (likely(!ret)) {
520 if (!ret)
521 mutex_set_owner(lock); 786 mutex_set_owner(lock);
522 787 return 0;
523 return ret; 788 } else
789 return __mutex_lock_interruptible_slowpath(lock);
524} 790}
525 791
526EXPORT_SYMBOL(mutex_lock_interruptible); 792EXPORT_SYMBOL(mutex_lock_interruptible);
@@ -530,12 +796,12 @@ int __sched mutex_lock_killable(struct mutex *lock)
530 int ret; 796 int ret;
531 797
532 might_sleep(); 798 might_sleep();
533 ret = __mutex_fastpath_lock_retval 799 ret = __mutex_fastpath_lock_retval(&lock->count);
534 (&lock->count, __mutex_lock_killable_slowpath); 800 if (likely(!ret)) {
535 if (!ret)
536 mutex_set_owner(lock); 801 mutex_set_owner(lock);
537 802 return 0;
538 return ret; 803 } else
804 return __mutex_lock_killable_slowpath(lock);
539} 805}
540EXPORT_SYMBOL(mutex_lock_killable); 806EXPORT_SYMBOL(mutex_lock_killable);
541 807
@@ -544,24 +810,39 @@ __mutex_lock_slowpath(atomic_t *lock_count)
544{ 810{
545 struct mutex *lock = container_of(lock_count, struct mutex, count); 811 struct mutex *lock = container_of(lock_count, struct mutex, count);
546 812
547 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); 813 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
814 NULL, _RET_IP_, NULL);
548} 815}
549 816
550static noinline int __sched 817static noinline int __sched
551__mutex_lock_killable_slowpath(atomic_t *lock_count) 818__mutex_lock_killable_slowpath(struct mutex *lock)
552{ 819{
553 struct mutex *lock = container_of(lock_count, struct mutex, count); 820 return __mutex_lock_common(lock, TASK_KILLABLE, 0,
821 NULL, _RET_IP_, NULL);
822}
554 823
555 return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); 824static noinline int __sched
825__mutex_lock_interruptible_slowpath(struct mutex *lock)
826{
827 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
828 NULL, _RET_IP_, NULL);
556} 829}
557 830
558static noinline int __sched 831static noinline int __sched
559__mutex_lock_interruptible_slowpath(atomic_t *lock_count) 832__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
560{ 833{
561 struct mutex *lock = container_of(lock_count, struct mutex, count); 834 return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
835 NULL, _RET_IP_, ctx);
836}
562 837
563 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); 838static noinline int __sched
839__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
840 struct ww_acquire_ctx *ctx)
841{
842 return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
843 NULL, _RET_IP_, ctx);
564} 844}
845
565#endif 846#endif
566 847
567/* 848/*
@@ -617,6 +898,45 @@ int __sched mutex_trylock(struct mutex *lock)
617} 898}
618EXPORT_SYMBOL(mutex_trylock); 899EXPORT_SYMBOL(mutex_trylock);
619 900
901#ifndef CONFIG_DEBUG_LOCK_ALLOC
902int __sched
903__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
904{
905 int ret;
906
907 might_sleep();
908
909 ret = __mutex_fastpath_lock_retval(&lock->base.count);
910
911 if (likely(!ret)) {
912 ww_mutex_set_context_fastpath(lock, ctx);
913 mutex_set_owner(&lock->base);
914 } else
915 ret = __ww_mutex_lock_slowpath(lock, ctx);
916 return ret;
917}
918EXPORT_SYMBOL(__ww_mutex_lock);
919
920int __sched
921__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
922{
923 int ret;
924
925 might_sleep();
926
927 ret = __mutex_fastpath_lock_retval(&lock->base.count);
928
929 if (likely(!ret)) {
930 ww_mutex_set_context_fastpath(lock, ctx);
931 mutex_set_owner(&lock->base);
932 } else
933 ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx);
934 return ret;
935}
936EXPORT_SYMBOL(__ww_mutex_lock_interruptible);
937
938#endif
939
620/** 940/**
621 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 941 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
622 * @cnt: the atomic which we are to dec 942 * @cnt: the atomic which we are to dec
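
The mutex.c additions above are the wound/wait core: ww_mutex_unlock(), the stamp comparison in __mutex_lock_check_stamp() that hands -EDEADLK to the younger context, the fastpath/slowpath context setters, and optional deadlock injection. The sketch below shows the intended caller pattern in terms of the wrapper API that accompanies this core (ww_mutex_lock(), ww_mutex_lock_slow(), ww_acquire_init/done/fini); it is an illustration with hypothetical demo names, not code from the patch, and it assumes those wrappers are declared alongside struct mutex in this series.

#include <linux/kernel.h>
#include <linux/mutex.h>	/* assumed home of the ww_mutex wrappers here */

static DEFINE_WW_CLASS(demo_ww_class);

/* Acquire two ww_mutexes in either order without risking ABBA deadlock. */
static int demo_lock_pair(struct ww_mutex *a, struct ww_mutex *b)
{
	struct ww_acquire_ctx ctx;
	int ret;

	ww_acquire_init(&ctx, &demo_ww_class);

	ret = ww_mutex_lock(a, &ctx);
	if (ret)		/* nothing is held at this point, just bail out */
		goto out_fini;

	ret = ww_mutex_lock(b, &ctx);
	while (ret == -EDEADLK) {
		/*
		 * An older context owns the contended lock: back off by
		 * dropping what we hold, sleep on the contended lock with
		 * the cannot-fail slow path, then retry the other lock.
		 */
		ww_mutex_unlock(a);
		ww_mutex_lock_slow(b, &ctx);
		swap(a, b);
		ret = ww_mutex_lock(b, &ctx);
	}
	if (ret) {		/* e.g. -EALREADY on a caller error */
		ww_mutex_unlock(a);
		goto out_fini;
	}

	ww_acquire_done(&ctx);	/* no further locks in this context */

	/* ... work on both objects ... */

	ww_mutex_unlock(a);
	ww_mutex_unlock(b);
out_fini:
	ww_acquire_fini(&ctx);
	return ret;
}

ww_mutex_lock_slow() may only be called once every lock held in the same context has been released, which is why the sketch drops the other lock before sleeping on the contended one.
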
diff --git a/kernel/pid.c b/kernel/pid.c
index 0db3e791a06d..66505c1dfc51 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -75,6 +75,7 @@ struct pid_namespace init_pid_ns = {
75 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } 75 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
76 }, 76 },
77 .last_pid = 0, 77 .last_pid = 0,
78 .nr_hashed = PIDNS_HASH_ADDING,
78 .level = 0, 79 .level = 0,
79 .child_reaper = &init_task, 80 .child_reaper = &init_task,
80 .user_ns = &init_user_ns, 81 .user_ns = &init_user_ns,
@@ -373,14 +374,10 @@ EXPORT_SYMBOL_GPL(find_vpid);
373/* 374/*
374 * attach_pid() must be called with the tasklist_lock write-held. 375 * attach_pid() must be called with the tasklist_lock write-held.
375 */ 376 */
376void attach_pid(struct task_struct *task, enum pid_type type, 377void attach_pid(struct task_struct *task, enum pid_type type)
377 struct pid *pid)
378{ 378{
379 struct pid_link *link; 379 struct pid_link *link = &task->pids[type];
380 380 hlist_add_head_rcu(&link->node, &link->pid->tasks[type]);
381 link = &task->pids[type];
382 link->pid = pid;
383 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
384} 381}
385 382
386static void __change_pid(struct task_struct *task, enum pid_type type, 383static void __change_pid(struct task_struct *task, enum pid_type type,
@@ -412,7 +409,7 @@ void change_pid(struct task_struct *task, enum pid_type type,
412 struct pid *pid) 409 struct pid *pid)
413{ 410{
414 __change_pid(task, type, pid); 411 __change_pid(task, type, pid);
415 attach_pid(task, type, pid); 412 attach_pid(task, type);
416} 413}
417 414
418/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ 415/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
@@ -594,7 +591,6 @@ void __init pidmap_init(void)
594 /* Reserve PID 0. We never call free_pidmap(0) */ 591 /* Reserve PID 0. We never call free_pidmap(0) */
595 set_bit(0, init_pid_ns.pidmap[0].page); 592 set_bit(0, init_pid_ns.pidmap[0].page);
596 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 593 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
597 init_pid_ns.nr_hashed = PIDNS_HASH_ADDING;
598 594
599 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 595 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
600 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 596 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5dfdc9ea180b..d444c4e834f4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -100,7 +100,6 @@ config PM_SLEEP_SMP
100 depends on SMP 100 depends on SMP
101 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE 101 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
102 depends on PM_SLEEP 102 depends on PM_SLEEP
103 select HOTPLUG
104 select HOTPLUG_CPU 103 select HOTPLUG_CPU
105 104
106config PM_AUTOSLEEP 105config PM_AUTOSLEEP
@@ -263,6 +262,26 @@ config PM_GENERIC_DOMAINS
263 bool 262 bool
264 depends on PM 263 depends on PM
265 264
265config WQ_POWER_EFFICIENT_DEFAULT
266 bool "Enable workqueue power-efficient mode by default"
267 depends on PM
268 default n
269 help
270 Per-cpu workqueues are generally preferred because they show
271 better performance thanks to cache locality; unfortunately,
272 per-cpu workqueues tend to be more power hungry than unbound
273 workqueues.
274
275 Enabling the workqueue.power_efficient kernel parameter makes the
276 per-cpu workqueues which were observed to contribute
277 significantly to power consumption unbound, leading to measurably
278 lower power usage at the cost of a small performance overhead.
279
280 This config option determines whether workqueue.power_efficient
281 is enabled by default.
282
283 If in doubt, say N.
284
266config PM_GENERIC_DOMAINS_SLEEP 285config PM_GENERIC_DOMAINS_SLEEP
267 def_bool y 286 def_bool y
268 depends on PM_SLEEP && PM_GENERIC_DOMAINS 287 depends on PM_SLEEP && PM_GENERIC_DOMAINS
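
The help text above describes the workqueue.power_efficient behaviour that WQ_POWER_EFFICIENT_DEFAULT merely turns on by default. As an illustration only: a driver opts a queue into this scheme by allocating it with the corresponding workqueue flag, assumed here to be WQ_POWER_EFFICIENT from the matching workqueue change in this series rather than from this hunk. When the parameter is off the queue stays per-cpu; when it is on (for example by booting with workqueue.power_efficient=1, or via this new default) such queues behave as unbound.

#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;

static void demo_work_fn(struct work_struct *work)
{
	/* infrequent background work: cache locality matters little here */
}
static DECLARE_WORK(demo_work, demo_work_fn);

static int demo_setup(void)
{
	/* Per-cpu by default; becomes unbound when power_efficient is set. */
	demo_wq = alloc_workqueue("demo_wq", WQ_POWER_EFFICIENT, 0);
	if (!demo_wq)
		return -ENOMEM;

	queue_work(demo_wq, &demo_work);
	return 0;
}
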
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d77663bfedeb..1d1bf630e6e9 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -424,6 +424,8 @@ static ssize_t wakeup_count_store(struct kobject *kobj,
424 if (sscanf(buf, "%u", &val) == 1) { 424 if (sscanf(buf, "%u", &val) == 1) {
425 if (pm_save_wakeup_count(val)) 425 if (pm_save_wakeup_count(val))
426 error = n; 426 error = n;
427 else
428 pm_print_active_wakeup_sources();
427 } 429 }
428 430
429 out: 431 out:
@@ -528,6 +530,10 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
528 530
529 if (sscanf(buf, "%d", &val) == 1) { 531 if (sscanf(buf, "%d", &val) == 1) {
530 pm_trace_enabled = !!val; 532 pm_trace_enabled = !!val;
533 if (pm_trace_enabled) {
534 pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n"
535 "PM: Correct system time has to be restored manually after resume.\n");
536 }
531 return n; 537 return n;
532 } 538 }
533 return -EINVAL; 539 return -EINVAL;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 98088e0e71e8..fc0df8486449 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -30,9 +30,10 @@ static int try_to_freeze_tasks(bool user_only)
30 unsigned int todo; 30 unsigned int todo;
31 bool wq_busy = false; 31 bool wq_busy = false;
32 struct timeval start, end; 32 struct timeval start, end;
33 u64 elapsed_csecs64; 33 u64 elapsed_msecs64;
34 unsigned int elapsed_csecs; 34 unsigned int elapsed_msecs;
35 bool wakeup = false; 35 bool wakeup = false;
36 int sleep_usecs = USEC_PER_MSEC;
36 37
37 do_gettimeofday(&start); 38 do_gettimeofday(&start);
38 39
@@ -68,22 +69,25 @@ static int try_to_freeze_tasks(bool user_only)
68 69
69 /* 70 /*
70 * We need to retry, but first give the freezing tasks some 71 * We need to retry, but first give the freezing tasks some
71 * time to enter the refrigerator. 72 * time to enter the refrigerator. Start with an initial
73 * 1 ms sleep followed by exponential backoff until 8 ms.
72 */ 74 */
73 msleep(10); 75 usleep_range(sleep_usecs / 2, sleep_usecs);
76 if (sleep_usecs < 8 * USEC_PER_MSEC)
77 sleep_usecs *= 2;
74 } 78 }
75 79
76 do_gettimeofday(&end); 80 do_gettimeofday(&end);
77 elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); 81 elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
78 do_div(elapsed_csecs64, NSEC_PER_SEC / 100); 82 do_div(elapsed_msecs64, NSEC_PER_MSEC);
79 elapsed_csecs = elapsed_csecs64; 83 elapsed_msecs = elapsed_msecs64;
80 84
81 if (todo) { 85 if (todo) {
82 printk("\n"); 86 printk("\n");
83 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " 87 printk(KERN_ERR "Freezing of tasks %s after %d.%03d seconds "
84 "(%d tasks refusing to freeze, wq_busy=%d):\n", 88 "(%d tasks refusing to freeze, wq_busy=%d):\n",
85 wakeup ? "aborted" : "failed", 89 wakeup ? "aborted" : "failed",
86 elapsed_csecs / 100, elapsed_csecs % 100, 90 elapsed_msecs / 1000, elapsed_msecs % 1000,
87 todo - wq_busy, wq_busy); 91 todo - wq_busy, wq_busy);
88 92
89 if (!wakeup) { 93 if (!wakeup) {
@@ -96,8 +100,8 @@ static int try_to_freeze_tasks(bool user_only)
96 read_unlock(&tasklist_lock); 100 read_unlock(&tasklist_lock);
97 } 101 }
98 } else { 102 } else {
99 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, 103 printk("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
100 elapsed_csecs % 100); 104 elapsed_msecs % 1000);
101 } 105 }
102 106
103 return todo ? -EBUSY : 0; 107 return todo ? -EBUSY : 0;
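
The try_to_freeze_tasks() change replaces the fixed msleep(10) with usleep_range() plus exponential backoff from 1 ms up to 8 ms, and reports elapsed time in milliseconds instead of centiseconds. The same polling idiom in isolation (a generic sketch, not the patch; the demo name is hypothetical):

#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/time.h>

/* Poll @cond until it is true or @timeout_ms expires, backing off 1 ms -> 8 ms. */
static bool demo_poll_backoff(bool (*cond)(void *arg), void *arg,
			      unsigned int timeout_ms)
{
	unsigned long deadline = jiffies + msecs_to_jiffies(timeout_ms);
	int sleep_usecs = USEC_PER_MSEC;	/* start at 1 ms */

	while (!cond(arg)) {
		if (time_after(jiffies, deadline))
			return false;
		/* Sleep anywhere in [sleep_usecs/2, sleep_usecs]. */
		usleep_range(sleep_usecs / 2, sleep_usecs);
		if (sleep_usecs < 8 * USEC_PER_MSEC)
			sleep_usecs *= 2;	/* cap the backoff at 8 ms */
	}
	return true;
}
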
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 587dddeebf15..06fe28589e9c 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -44,6 +44,7 @@
44 44
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/export.h> 46#include <linux/export.h>
47#include <trace/events/power.h>
47 48
48/* 49/*
49 * locking rule: all changes to constraints or notifiers lists 50 * locking rule: all changes to constraints or notifiers lists
@@ -202,6 +203,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
202 203
203 spin_unlock_irqrestore(&pm_qos_lock, flags); 204 spin_unlock_irqrestore(&pm_qos_lock, flags);
204 205
206 trace_pm_qos_update_target(action, prev_value, curr_value);
205 if (prev_value != curr_value) { 207 if (prev_value != curr_value) {
206 blocking_notifier_call_chain(c->notifiers, 208 blocking_notifier_call_chain(c->notifiers,
207 (unsigned long)curr_value, 209 (unsigned long)curr_value,
@@ -272,6 +274,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,
272 274
273 spin_unlock_irqrestore(&pm_qos_lock, irqflags); 275 spin_unlock_irqrestore(&pm_qos_lock, irqflags);
274 276
277 trace_pm_qos_update_flags(action, prev_value, curr_value);
275 return prev_value != curr_value; 278 return prev_value != curr_value;
276} 279}
277 280
@@ -333,6 +336,7 @@ void pm_qos_add_request(struct pm_qos_request *req,
333 } 336 }
334 req->pm_qos_class = pm_qos_class; 337 req->pm_qos_class = pm_qos_class;
335 INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); 338 INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
339 trace_pm_qos_add_request(pm_qos_class, value);
336 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, 340 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
337 &req->node, PM_QOS_ADD_REQ, value); 341 &req->node, PM_QOS_ADD_REQ, value);
338} 342}
@@ -361,6 +365,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
361 365
362 cancel_delayed_work_sync(&req->work); 366 cancel_delayed_work_sync(&req->work);
363 367
368 trace_pm_qos_update_request(req->pm_qos_class, new_value);
364 if (new_value != req->node.prio) 369 if (new_value != req->node.prio)
365 pm_qos_update_target( 370 pm_qos_update_target(
366 pm_qos_array[req->pm_qos_class]->constraints, 371 pm_qos_array[req->pm_qos_class]->constraints,
@@ -387,6 +392,8 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
387 392
388 cancel_delayed_work_sync(&req->work); 393 cancel_delayed_work_sync(&req->work);
389 394
395 trace_pm_qos_update_request_timeout(req->pm_qos_class,
396 new_value, timeout_us);
390 if (new_value != req->node.prio) 397 if (new_value != req->node.prio)
391 pm_qos_update_target( 398 pm_qos_update_target(
392 pm_qos_array[req->pm_qos_class]->constraints, 399 pm_qos_array[req->pm_qos_class]->constraints,
@@ -416,6 +423,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)
416 423
417 cancel_delayed_work_sync(&req->work); 424 cancel_delayed_work_sync(&req->work);
418 425
426 trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE);
419 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, 427 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
420 &req->node, PM_QOS_REMOVE_REQ, 428 &req->node, PM_QOS_REMOVE_REQ,
421 PM_QOS_DEFAULT_VALUE); 429 PM_QOS_DEFAULT_VALUE);
@@ -477,7 +485,7 @@ static int find_pm_qos_object_by_minor(int minor)
477{ 485{
478 int pm_qos_class; 486 int pm_qos_class;
479 487
480 for (pm_qos_class = 0; 488 for (pm_qos_class = PM_QOS_CPU_DMA_LATENCY;
481 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { 489 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
482 if (minor == 490 if (minor ==
483 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) 491 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
@@ -491,7 +499,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
491 long pm_qos_class; 499 long pm_qos_class;
492 500
493 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 501 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
494 if (pm_qos_class >= 0) { 502 if (pm_qos_class >= PM_QOS_CPU_DMA_LATENCY) {
495 struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); 503 struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
496 if (!req) 504 if (!req)
497 return -ENOMEM; 505 return -ENOMEM;
@@ -584,7 +592,7 @@ static int __init pm_qos_power_init(void)
584 592
585 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); 593 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
586 594
587 for (i = 1; i < PM_QOS_NUM_CLASSES; i++) { 595 for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
588 ret = register_pm_qos_misc(pm_qos_array[i]); 596 ret = register_pm_qos_misc(pm_qos_array[i]);
589 if (ret < 0) { 597 if (ret < 0) {
590 printk(KERN_ERR "pm_qos_param: %s setup failed\n", 598 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
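
The qos.c hunks add tracepoints around the existing request API and make the misc-device paths start at PM_QOS_CPU_DMA_LATENCY instead of a raw index. For reference, the request lifecycle those tracepoints now cover looks like this from a driver (sketch only, demo_* names hypothetical):

#include <linux/pm_qos.h>

static struct pm_qos_request demo_qos_req;

static void demo_latency_sensitive_start(void)
{
	/* Tolerate at most 20 us of CPU DMA latency while we run. */
	pm_qos_add_request(&demo_qos_req, PM_QOS_CPU_DMA_LATENCY, 20);
}

static void demo_latency_relax(void)
{
	/* Loosen the constraint; fires trace_pm_qos_update_request(). */
	pm_qos_update_request(&demo_qos_req, 100);
}

static void demo_latency_sensitive_stop(void)
{
	/* Drop back to PM_QOS_DEFAULT_VALUE; fires trace_pm_qos_remove_request(). */
	pm_qos_remove_request(&demo_qos_req);
}
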
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0de28576807d..349587bb03e1 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -642,8 +642,9 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
642 region->end_pfn = end_pfn; 642 region->end_pfn = end_pfn;
643 list_add_tail(&region->list, &nosave_regions); 643 list_add_tail(&region->list, &nosave_regions);
644 Report: 644 Report:
645 printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n", 645 printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n",
646 start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); 646 (unsigned long long) start_pfn << PAGE_SHIFT,
647 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
647} 648}
648 649
649/* 650/*
@@ -1651,7 +1652,7 @@ unsigned long snapshot_get_image_size(void)
1651static int init_header(struct swsusp_info *info) 1652static int init_header(struct swsusp_info *info)
1652{ 1653{
1653 memset(info, 0, sizeof(struct swsusp_info)); 1654 memset(info, 0, sizeof(struct swsusp_info));
1654 info->num_physpages = num_physpages; 1655 info->num_physpages = get_num_physpages();
1655 info->image_pages = nr_copy_pages; 1656 info->image_pages = nr_copy_pages;
1656 info->pages = snapshot_get_image_size(); 1657 info->pages = snapshot_get_image_size();
1657 info->size = info->pages; 1658 info->size = info->pages;
@@ -1795,7 +1796,7 @@ static int check_header(struct swsusp_info *info)
1795 char *reason; 1796 char *reason;
1796 1797
1797 reason = check_image_kernel(info); 1798 reason = check_image_kernel(info);
1798 if (!reason && info->num_physpages != num_physpages) 1799 if (!reason && info->num_physpages != get_num_physpages())
1799 reason = "memory size"; 1800 reason = "memory size";
1800 if (reason) { 1801 if (reason) {
1801 printk(KERN_ERR "PM: Image mismatch: %s\n", reason); 1802 printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index bef86d121eb2..ece04223bb1e 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -269,7 +269,7 @@ int suspend_devices_and_enter(suspend_state_t state)
269 suspend_test_start(); 269 suspend_test_start();
270 error = dpm_suspend_start(PMSG_SUSPEND); 270 error = dpm_suspend_start(PMSG_SUSPEND);
271 if (error) { 271 if (error) {
272 printk(KERN_ERR "PM: Some devices failed to suspend\n"); 272 pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
273 goto Recover_platform; 273 goto Recover_platform;
274 } 274 }
275 suspend_test_finish("suspend devices"); 275 suspend_test_finish("suspend devices");
diff --git a/kernel/printk.c b/kernel/printk.c
index fa36e1494420..8212c1aef125 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -363,6 +363,53 @@ static void log_store(int facility, int level,
363 log_next_seq++; 363 log_next_seq++;
364} 364}
365 365
366#ifdef CONFIG_SECURITY_DMESG_RESTRICT
367int dmesg_restrict = 1;
368#else
369int dmesg_restrict;
370#endif
371
372static int syslog_action_restricted(int type)
373{
374 if (dmesg_restrict)
375 return 1;
376 /*
377 * Unless restricted, we allow "read all" and "get buffer size"
378 * for everybody.
379 */
380 return type != SYSLOG_ACTION_READ_ALL &&
381 type != SYSLOG_ACTION_SIZE_BUFFER;
382}
383
384static int check_syslog_permissions(int type, bool from_file)
385{
386 /*
387 * If this is from /proc/kmsg and we've already opened it, then we've
388 * already done the capabilities checks at open time.
389 */
390 if (from_file && type != SYSLOG_ACTION_OPEN)
391 return 0;
392
393 if (syslog_action_restricted(type)) {
394 if (capable(CAP_SYSLOG))
395 return 0;
396 /*
397 * For historical reasons, accept CAP_SYS_ADMIN too, with
398 * a warning.
399 */
400 if (capable(CAP_SYS_ADMIN)) {
401 pr_warn_once("%s (%d): Attempt to access syslog with "
402 "CAP_SYS_ADMIN but no CAP_SYSLOG "
403 "(deprecated).\n",
404 current->comm, task_pid_nr(current));
405 return 0;
406 }
407 return -EPERM;
408 }
409 return security_syslog(type);
410}
411
412
366/* /dev/kmsg - userspace message inject/listen interface */ 413/* /dev/kmsg - userspace message inject/listen interface */
367struct devkmsg_user { 414struct devkmsg_user {
368 u64 seq; 415 u64 seq;
@@ -620,7 +667,8 @@ static int devkmsg_open(struct inode *inode, struct file *file)
620 if ((file->f_flags & O_ACCMODE) == O_WRONLY) 667 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
621 return 0; 668 return 0;
622 669
623 err = security_syslog(SYSLOG_ACTION_READ_ALL); 670 err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
671 SYSLOG_FROM_READER);
624 if (err) 672 if (err)
625 return err; 673 return err;
626 674
@@ -813,45 +861,6 @@ static inline void boot_delay_msec(int level)
813} 861}
814#endif 862#endif
815 863
816#ifdef CONFIG_SECURITY_DMESG_RESTRICT
817int dmesg_restrict = 1;
818#else
819int dmesg_restrict;
820#endif
821
822static int syslog_action_restricted(int type)
823{
824 if (dmesg_restrict)
825 return 1;
826 /* Unless restricted, we allow "read all" and "get buffer size" for everybody */
827 return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
828}
829
830static int check_syslog_permissions(int type, bool from_file)
831{
832 /*
833 * If this is from /proc/kmsg and we've already opened it, then we've
834 * already done the capabilities checks at open time.
835 */
836 if (from_file && type != SYSLOG_ACTION_OPEN)
837 return 0;
838
839 if (syslog_action_restricted(type)) {
840 if (capable(CAP_SYSLOG))
841 return 0;
842 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
843 if (capable(CAP_SYS_ADMIN)) {
844 printk_once(KERN_WARNING "%s (%d): "
845 "Attempt to access syslog with CAP_SYS_ADMIN "
846 "but no CAP_SYSLOG (deprecated).\n",
847 current->comm, task_pid_nr(current));
848 return 0;
849 }
850 return -EPERM;
851 }
852 return 0;
853}
854
855#if defined(CONFIG_PRINTK_TIME) 864#if defined(CONFIG_PRINTK_TIME)
856static bool printk_time = 1; 865static bool printk_time = 1;
857#else 866#else
@@ -1249,7 +1258,7 @@ out:
1249 1258
1250SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 1259SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1251{ 1260{
1252 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 1261 return do_syslog(type, buf, len, SYSLOG_FROM_READER);
1253} 1262}
1254 1263
1255/* 1264/*
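Taken together, the printk.c hunks above route /dev/kmsg readers through the same check_syslog_permissions() gate as the syslog(2)/klogctl(3) path: once dmesg_restrict is set, SYSLOG_ACTION_READ_ALL and SYSLOG_ACTION_SIZE_BUFFER need CAP_SYSLOG, with CAP_SYS_ADMIN still accepted under a one-time deprecation warning. A minimal userspace sketch of what a reader sees follows; the numeric action values are assumptions standing in for SYSLOG_ACTION_SIZE_BUFFER (10) and SYSLOG_ACTION_READ_ALL (3), since glibc does not export the kernel enum.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/klog.h>

int main(void)
{
	/* 10 == SYSLOG_ACTION_SIZE_BUFFER (assumed value) */
	int len = klogctl(10, NULL, 0);
	char *buf;
	int n;

	if (len < 0) {
		perror("klogctl(SIZE_BUFFER)");
		return 1;
	}
	buf = malloc(len);
	if (!buf)
		return 1;
	/* 3 == SYSLOG_ACTION_READ_ALL (assumed value); with dmesg_restrict=1
	 * and no CAP_SYSLOG this is expected to fail with EPERM. */
	n = klogctl(3, buf, len);
	if (n < 0)
		fprintf(stderr, "read all: %s\n", strerror(errno));
	else
		printf("read %d bytes of kernel log\n", n);
	free(buf);
	return 0;
}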
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index aed981a3f69c..ba5e6cea181a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -665,20 +665,22 @@ static int ptrace_peek_siginfo(struct task_struct *child,
665 if (unlikely(is_compat_task())) { 665 if (unlikely(is_compat_task())) {
666 compat_siginfo_t __user *uinfo = compat_ptr(data); 666 compat_siginfo_t __user *uinfo = compat_ptr(data);
667 667
668 ret = copy_siginfo_to_user32(uinfo, &info); 668 if (copy_siginfo_to_user32(uinfo, &info) ||
669 ret |= __put_user(info.si_code, &uinfo->si_code); 669 __put_user(info.si_code, &uinfo->si_code)) {
670 ret = -EFAULT;
671 break;
672 }
673
670 } else 674 } else
671#endif 675#endif
672 { 676 {
673 siginfo_t __user *uinfo = (siginfo_t __user *) data; 677 siginfo_t __user *uinfo = (siginfo_t __user *) data;
674 678
675 ret = copy_siginfo_to_user(uinfo, &info); 679 if (copy_siginfo_to_user(uinfo, &info) ||
676 ret |= __put_user(info.si_code, &uinfo->si_code); 680 __put_user(info.si_code, &uinfo->si_code)) {
677 } 681 ret = -EFAULT;
678 682 break;
679 if (ret) { 683 }
680 ret = -EFAULT;
681 break;
682 } 684 }
683 685
684 data += sizeof(siginfo_t); 686 data += sizeof(siginfo_t);
@@ -842,6 +844,47 @@ int ptrace_request(struct task_struct *child, long request,
842 ret = ptrace_setsiginfo(child, &siginfo); 844 ret = ptrace_setsiginfo(child, &siginfo);
843 break; 845 break;
844 846
847 case PTRACE_GETSIGMASK:
848 if (addr != sizeof(sigset_t)) {
849 ret = -EINVAL;
850 break;
851 }
852
853 if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
854 ret = -EFAULT;
855 else
856 ret = 0;
857
858 break;
859
860 case PTRACE_SETSIGMASK: {
861 sigset_t new_set;
862
863 if (addr != sizeof(sigset_t)) {
864 ret = -EINVAL;
865 break;
866 }
867
868 if (copy_from_user(&new_set, datavp, sizeof(sigset_t))) {
869 ret = -EFAULT;
870 break;
871 }
872
873 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
874
875 /*
876 * Every thread does recalc_sigpending() after resume, so
877 * retarget_shared_pending() and recalc_sigpending() are not
878 * called here.
879 */
880 spin_lock_irq(&child->sighand->siglock);
881 child->blocked = new_set;
882 spin_unlock_irq(&child->sighand->siglock);
883
884 ret = 0;
885 break;
886 }
887
845 case PTRACE_INTERRUPT: 888 case PTRACE_INTERRUPT:
846 /* 889 /*
847 * Stop tracee without any side-effect on signal or job 890 * Stop tracee without any side-effect on signal or job
@@ -946,8 +989,7 @@ int ptrace_request(struct task_struct *child, long request,
946 989
947#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 990#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
948 case PTRACE_GETREGSET: 991 case PTRACE_GETREGSET:
949 case PTRACE_SETREGSET: 992 case PTRACE_SETREGSET: {
950 {
951 struct iovec kiov; 993 struct iovec kiov;
952 struct iovec __user *uiov = datavp; 994 struct iovec __user *uiov = datavp;
953 995
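The new PTRACE_GETSIGMASK/PTRACE_SETSIGMASK requests above pass the sigset size in addr and the user buffer in data, and the kernel strips SIGKILL/SIGSTOP from any mask written back. Below is a rough tracer-side sketch, under the assumption that the request values are 0x420a/0x420b as in the UAPI header (guarded here in case the libc headers predate them).

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

#ifndef PTRACE_GETSIGMASK
#define PTRACE_GETSIGMASK	0x420a	/* assumed, from linux/ptrace.h */
#define PTRACE_SETSIGMASK	0x420b
#endif

/* Read the stopped tracee's blocked-signal mask, then clear it. */
static int clear_blocked(pid_t pid)
{
	sigset_t set;

	/* addr carries sizeof(sigset_t); anything else gets -EINVAL. */
	if (ptrace(PTRACE_GETSIGMASK, pid, sizeof(set), &set) < 0) {
		perror("PTRACE_GETSIGMASK");
		return -1;
	}
	sigemptyset(&set);
	if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(set), &set) < 0) {
		perror("PTRACE_SETSIGMASK");
		return -1;
	}
	return 0;
}

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);			/* enter ptrace-stop */
		_exit(0);
	}
	waitpid(pid, NULL, 0);			/* tracee is now stopped */
	if (clear_blocked(pid) == 0)
		puts("blocked mask read and cleared");
	ptrace(PTRACE_CONT, pid, NULL, NULL);
	waitpid(pid, NULL, 0);
	return 0;
}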
diff --git a/kernel/range.c b/kernel/range.c
index eb911dbce267..322ea8e93e4b 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -4,7 +4,7 @@
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/init.h> 5#include <linux/init.h>
6#include <linux/sort.h> 6#include <linux/sort.h>
7 7#include <linux/string.h>
8#include <linux/range.h> 8#include <linux/range.h>
9 9
10int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) 10int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
@@ -32,9 +32,8 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
32 if (start >= end) 32 if (start >= end)
33 return nr_range; 33 return nr_range;
34 34
35 /* Try to merge it with old one: */ 35 /* get new start/end: */
36 for (i = 0; i < nr_range; i++) { 36 for (i = 0; i < nr_range; i++) {
37 u64 final_start, final_end;
38 u64 common_start, common_end; 37 u64 common_start, common_end;
39 38
40 if (!range[i].end) 39 if (!range[i].end)
@@ -45,14 +44,16 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
45 if (common_start > common_end) 44 if (common_start > common_end)
46 continue; 45 continue;
47 46
48 final_start = min(range[i].start, start); 47 /* new start/end, will add it back at last */
49 final_end = max(range[i].end, end); 48 start = min(range[i].start, start);
49 end = max(range[i].end, end);
50 50
51 /* clear it and add it back for further merge */ 51 memmove(&range[i], &range[i + 1],
52 range[i].start = 0; 52 (nr_range - (i + 1)) * sizeof(range[i]));
53 range[i].end = 0; 53 range[nr_range - 1].start = 0;
54 return add_range_with_merge(range, az, nr_range, 54 range[nr_range - 1].end = 0;
55 final_start, final_end); 55 nr_range--;
56 i--;
56 } 57 }
57 58
58 /* Need to add it: */ 59 /* Need to add it: */
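The add_range_with_merge() rewrite above drops the tail recursion: each overlapping (or touching) entry is absorbed into the incoming [start, end) and squeezed out of the array with memmove(), and only the fully merged range is appended at the end. A standalone sketch of the same loop, with the kernel types simplified so it compiles in userspace (the names and the omitted capacity check are illustrative, not the kernel's):

#include <stdio.h>
#include <string.h>

struct range { unsigned long long start, end; };

static int merge_add(struct range *range, int nr_range,
		     unsigned long long start, unsigned long long end)
{
	int i;

	for (i = 0; i < nr_range; i++) {
		unsigned long long cs, ce;

		if (!range[i].end)
			continue;
		cs = range[i].start > start ? range[i].start : start;
		ce = range[i].end < end ? range[i].end : end;
		if (cs > ce)
			continue;	/* no overlap, not even touching */

		/* Absorb the overlapping entry into [start, end)... */
		start = range[i].start < start ? range[i].start : start;
		end = range[i].end > end ? range[i].end : end;

		/* ...and compact the array instead of recursing. */
		memmove(&range[i], &range[i + 1],
			(nr_range - (i + 1)) * sizeof(range[i]));
		range[nr_range - 1].start = 0;
		range[nr_range - 1].end = 0;
		nr_range--;
		i--;
	}

	/* Append the merged result (capacity check omitted here). */
	range[nr_range].start = start;
	range[nr_range].end = end;
	return nr_range + 1;
}

int main(void)
{
	struct range r[8] = { { 10, 20 }, { 30, 40 } };
	int n = merge_add(r, 2, 15, 35);	/* overlaps both entries */

	printf("%d range(s), first = [%llu, %llu)\n", n, r[0].start, r[0].end);
	return 0;
}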
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 48ab70384a4c..cce6ba8bbace 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -104,31 +104,7 @@ void __rcu_read_unlock(void)
104} 104}
105EXPORT_SYMBOL_GPL(__rcu_read_unlock); 105EXPORT_SYMBOL_GPL(__rcu_read_unlock);
106 106
107/* 107#endif /* #ifdef CONFIG_PREEMPT_RCU */
108 * Check for a task exiting while in a preemptible-RCU read-side
109 * critical section, clean up if so. No need to issue warnings,
110 * as debug_check_no_locks_held() already does this if lockdep
111 * is enabled.
112 */
113void exit_rcu(void)
114{
115 struct task_struct *t = current;
116
117 if (likely(list_empty(&current->rcu_node_entry)))
118 return;
119 t->rcu_read_lock_nesting = 1;
120 barrier();
121 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
122 __rcu_read_unlock();
123}
124
125#else /* #ifdef CONFIG_PREEMPT_RCU */
126
127void exit_rcu(void)
128{
129}
130
131#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
132 108
133#ifdef CONFIG_DEBUG_LOCK_ALLOC 109#ifdef CONFIG_DEBUG_LOCK_ALLOC
134static struct lock_class_key rcu_lock_key; 110static struct lock_class_key rcu_lock_key;
@@ -145,9 +121,6 @@ static struct lock_class_key rcu_sched_lock_key;
145struct lockdep_map rcu_sched_lock_map = 121struct lockdep_map rcu_sched_lock_map =
146 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); 122 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
147EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 123EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
148#endif
149
150#ifdef CONFIG_DEBUG_LOCK_ALLOC
151 124
152int debug_lockdep_rcu_enabled(void) 125int debug_lockdep_rcu_enabled(void)
153{ 126{
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index a0714a51b6d7..aa344111de3e 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,7 +44,6 @@
44 44
45/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 46struct rcu_ctrlblk;
47static void invoke_rcu_callbacks(void);
48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 47static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
49static void rcu_process_callbacks(struct softirq_action *unused); 48static void rcu_process_callbacks(struct softirq_action *unused);
50static void __call_rcu(struct rcu_head *head, 49static void __call_rcu(struct rcu_head *head,
@@ -205,7 +204,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
205 */ 204 */
206static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 205static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
207{ 206{
208 reset_cpu_stall_ticks(rcp); 207 RCU_TRACE(reset_cpu_stall_ticks(rcp));
209 if (rcp->rcucblist != NULL && 208 if (rcp->rcucblist != NULL &&
210 rcp->donetail != rcp->curtail) { 209 rcp->donetail != rcp->curtail) {
211 rcp->donetail = rcp->curtail; 210 rcp->donetail = rcp->curtail;
@@ -227,7 +226,7 @@ void rcu_sched_qs(int cpu)
227 local_irq_save(flags); 226 local_irq_save(flags);
228 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 227 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
229 rcu_qsctr_help(&rcu_bh_ctrlblk)) 228 rcu_qsctr_help(&rcu_bh_ctrlblk))
230 invoke_rcu_callbacks(); 229 raise_softirq(RCU_SOFTIRQ);
231 local_irq_restore(flags); 230 local_irq_restore(flags);
232} 231}
233 232
@@ -240,7 +239,7 @@ void rcu_bh_qs(int cpu)
240 239
241 local_irq_save(flags); 240 local_irq_save(flags);
242 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 241 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
243 invoke_rcu_callbacks(); 242 raise_softirq(RCU_SOFTIRQ);
244 local_irq_restore(flags); 243 local_irq_restore(flags);
245} 244}
246 245
@@ -252,12 +251,11 @@ void rcu_bh_qs(int cpu)
252 */ 251 */
253void rcu_check_callbacks(int cpu, int user) 252void rcu_check_callbacks(int cpu, int user)
254{ 253{
255 check_cpu_stalls(); 254 RCU_TRACE(check_cpu_stalls());
256 if (user || rcu_is_cpu_rrupt_from_idle()) 255 if (user || rcu_is_cpu_rrupt_from_idle())
257 rcu_sched_qs(cpu); 256 rcu_sched_qs(cpu);
258 else if (!in_softirq()) 257 else if (!in_softirq())
259 rcu_bh_qs(cpu); 258 rcu_bh_qs(cpu);
260 rcu_preempt_check_callbacks();
261} 259}
262 260
263/* 261/*
@@ -278,7 +276,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
278 ACCESS_ONCE(rcp->rcucblist), 276 ACCESS_ONCE(rcp->rcucblist),
279 need_resched(), 277 need_resched(),
280 is_idle_task(current), 278 is_idle_task(current),
281 rcu_is_callbacks_kthread())); 279 false));
282 return; 280 return;
283 } 281 }
284 282
@@ -290,7 +288,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
290 *rcp->donetail = NULL; 288 *rcp->donetail = NULL;
291 if (rcp->curtail == rcp->donetail) 289 if (rcp->curtail == rcp->donetail)
292 rcp->curtail = &rcp->rcucblist; 290 rcp->curtail = &rcp->rcucblist;
293 rcu_preempt_remove_callbacks(rcp);
294 rcp->donetail = &rcp->rcucblist; 291 rcp->donetail = &rcp->rcucblist;
295 local_irq_restore(flags); 292 local_irq_restore(flags);
296 293
@@ -309,14 +306,13 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
309 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 306 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
310 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), 307 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
311 is_idle_task(current), 308 is_idle_task(current),
312 rcu_is_callbacks_kthread())); 309 false));
313} 310}
314 311
315static void rcu_process_callbacks(struct softirq_action *unused) 312static void rcu_process_callbacks(struct softirq_action *unused)
316{ 313{
317 __rcu_process_callbacks(&rcu_sched_ctrlblk); 314 __rcu_process_callbacks(&rcu_sched_ctrlblk);
318 __rcu_process_callbacks(&rcu_bh_ctrlblk); 315 __rcu_process_callbacks(&rcu_bh_ctrlblk);
319 rcu_preempt_process_callbacks();
320} 316}
321 317
322/* 318/*
@@ -382,3 +378,8 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
382 __call_rcu(head, func, &rcu_bh_ctrlblk); 378 __call_rcu(head, func, &rcu_bh_ctrlblk);
383} 379}
384EXPORT_SYMBOL_GPL(call_rcu_bh); 380EXPORT_SYMBOL_GPL(call_rcu_bh);
381
382void rcu_init(void)
383{
384 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
385}
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 8a233002faeb..0cd385acccfa 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -53,958 +53,10 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
53}; 53};
54 54
55#ifdef CONFIG_DEBUG_LOCK_ALLOC 55#ifdef CONFIG_DEBUG_LOCK_ALLOC
56#include <linux/kernel_stat.h>
57
56int rcu_scheduler_active __read_mostly; 58int rcu_scheduler_active __read_mostly;
57EXPORT_SYMBOL_GPL(rcu_scheduler_active); 59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
58#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
59
60#ifdef CONFIG_RCU_TRACE
61
62static void check_cpu_stall(struct rcu_ctrlblk *rcp)
63{
64 unsigned long j;
65 unsigned long js;
66
67 if (rcu_cpu_stall_suppress)
68 return;
69 rcp->ticks_this_gp++;
70 j = jiffies;
71 js = rcp->jiffies_stall;
72 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
73 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
74 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
75 jiffies - rcp->gp_start, rcp->qlen);
76 dump_stack();
77 }
78 if (*rcp->curtail && ULONG_CMP_GE(j, js))
79 rcp->jiffies_stall = jiffies +
80 3 * rcu_jiffies_till_stall_check() + 3;
81 else if (ULONG_CMP_GE(j, js))
82 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
83}
84
85static void check_cpu_stall_preempt(void);
86
87#endif /* #ifdef CONFIG_RCU_TRACE */
88
89static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
90{
91#ifdef CONFIG_RCU_TRACE
92 rcp->ticks_this_gp = 0;
93 rcp->gp_start = jiffies;
94 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
95#endif /* #ifdef CONFIG_RCU_TRACE */
96}
97
98static void check_cpu_stalls(void)
99{
100 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
101 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
102 RCU_TRACE(check_cpu_stall_preempt());
103}
104
105#ifdef CONFIG_TINY_PREEMPT_RCU
106
107#include <linux/delay.h>
108
109/* Global control variables for preemptible RCU. */
110struct rcu_preempt_ctrlblk {
111 struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
112 struct rcu_head **nexttail;
113 /* Tasks blocked in a preemptible RCU */
 114	/* read-side critical section while a */
115 /* preemptible-RCU grace period is in */
116 /* progress must wait for a later grace */
117 /* period. This pointer points to the */
118 /* ->next pointer of the last task that */
119 /* must wait for a later grace period, or */
120 /* to &->rcb.rcucblist if there is no */
121 /* such task. */
122 struct list_head blkd_tasks;
123 /* Tasks blocked in RCU read-side critical */
124 /* section. Tasks are placed at the head */
125 /* of this list and age towards the tail. */
126 struct list_head *gp_tasks;
127 /* Pointer to the first task blocking the */
128 /* current grace period, or NULL if there */
129 /* is no such task. */
130 struct list_head *exp_tasks;
131 /* Pointer to first task blocking the */
132 /* current expedited grace period, or NULL */
133 /* if there is no such task. If there */
134 /* is no current expedited grace period, */
135 /* then there cannot be any such task. */
136#ifdef CONFIG_RCU_BOOST
137 struct list_head *boost_tasks;
138 /* Pointer to first task that needs to be */
139 /* priority-boosted, or NULL if no priority */
140 /* boosting is needed. If there is no */
141 /* current or expedited grace period, there */
142 /* can be no such task. */
143#endif /* #ifdef CONFIG_RCU_BOOST */
144 u8 gpnum; /* Current grace period. */
145 u8 gpcpu; /* Last grace period blocked by the CPU. */
146 u8 completed; /* Last grace period completed. */
147 /* If all three are equal, RCU is idle. */
148#ifdef CONFIG_RCU_BOOST
149 unsigned long boost_time; /* When to start boosting (jiffies) */
150#endif /* #ifdef CONFIG_RCU_BOOST */
151#ifdef CONFIG_RCU_TRACE
152 unsigned long n_grace_periods;
153#ifdef CONFIG_RCU_BOOST
154 unsigned long n_tasks_boosted;
155 /* Total number of tasks boosted. */
156 unsigned long n_exp_boosts;
157 /* Number of tasks boosted for expedited GP. */
158 unsigned long n_normal_boosts;
159 /* Number of tasks boosted for normal GP. */
160 unsigned long n_balk_blkd_tasks;
161 /* Refused to boost: no blocked tasks. */
162 unsigned long n_balk_exp_gp_tasks;
163 /* Refused to boost: nothing blocking GP. */
164 unsigned long n_balk_boost_tasks;
165 /* Refused to boost: already boosting. */
166 unsigned long n_balk_notyet;
167 /* Refused to boost: not yet time. */
168 unsigned long n_balk_nos;
169 /* Refused to boost: not sure why, though. */
170 /* This can happen due to race conditions. */
171#endif /* #ifdef CONFIG_RCU_BOOST */
172#endif /* #ifdef CONFIG_RCU_TRACE */
173};
174
175static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
176 .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
177 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
178 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
179 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
180 RCU_TRACE(.rcb.name = "rcu_preempt")
181};
182
183static int rcu_preempted_readers_exp(void);
184static void rcu_report_exp_done(void);
185
186/*
187 * Return true if the CPU has not yet responded to the current grace period.
188 */
189static int rcu_cpu_blocking_cur_gp(void)
190{
191 return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
192}
193
194/*
195 * Check for a running RCU reader. Because there is only one CPU,
196 * there can be but one running RCU reader at a time. ;-)
197 *
203 * Returns zero if there are no running readers. Returns a positive
204 * number if there is at least one reader within its RCU read-side
205 * critical section. Returns a negative number if an outermost reader
206 * is in the midst of exiting from its RCU read-side critical section.
207 */
208static int rcu_preempt_running_reader(void)
209{
210 return current->rcu_read_lock_nesting;
211}
212
213/*
214 * Check for preempted RCU readers blocking any grace period.
215 * If the caller needs a reliable answer, it must disable hard irqs.
216 */
217static int rcu_preempt_blocked_readers_any(void)
218{
219 return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
220}
221
222/*
223 * Check for preempted RCU readers blocking the current grace period.
224 * If the caller needs a reliable answer, it must disable hard irqs.
225 */
226static int rcu_preempt_blocked_readers_cgp(void)
227{
228 return rcu_preempt_ctrlblk.gp_tasks != NULL;
229}
230
231/*
232 * Return true if another preemptible-RCU grace period is needed.
233 */
234static int rcu_preempt_needs_another_gp(void)
235{
236 return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
237}
238
239/*
240 * Return true if a preemptible-RCU grace period is in progress.
241 * The caller must disable hardirqs.
242 */
243static int rcu_preempt_gp_in_progress(void)
244{
245 return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
246}
247
248/*
 249 * Advance a ->blkd_tasks-list pointer to the next entry, returning
 250 * NULL instead if at the end of the list.
251 */
252static struct list_head *rcu_next_node_entry(struct task_struct *t)
253{
254 struct list_head *np;
255
256 np = t->rcu_node_entry.next;
257 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
258 np = NULL;
259 return np;
260}
261
262#ifdef CONFIG_RCU_TRACE
263
264#ifdef CONFIG_RCU_BOOST
265static void rcu_initiate_boost_trace(void);
266#endif /* #ifdef CONFIG_RCU_BOOST */
267
268/*
 269 * Dump additional statistics for TINY_PREEMPT_RCU.
270 */
271static void show_tiny_preempt_stats(struct seq_file *m)
272{
273 seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
274 rcu_preempt_ctrlblk.rcb.qlen,
275 rcu_preempt_ctrlblk.n_grace_periods,
276 rcu_preempt_ctrlblk.gpnum,
277 rcu_preempt_ctrlblk.gpcpu,
278 rcu_preempt_ctrlblk.completed,
279 "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
280 "N."[!rcu_preempt_ctrlblk.gp_tasks],
281 "E."[!rcu_preempt_ctrlblk.exp_tasks]);
282#ifdef CONFIG_RCU_BOOST
283 seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
284 " ",
285 "B."[!rcu_preempt_ctrlblk.boost_tasks],
286 rcu_preempt_ctrlblk.n_tasks_boosted,
287 rcu_preempt_ctrlblk.n_exp_boosts,
288 rcu_preempt_ctrlblk.n_normal_boosts,
289 (int)(jiffies & 0xffff),
290 (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
291 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
292 " balk",
293 rcu_preempt_ctrlblk.n_balk_blkd_tasks,
294 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
295 rcu_preempt_ctrlblk.n_balk_boost_tasks,
296 rcu_preempt_ctrlblk.n_balk_notyet,
297 rcu_preempt_ctrlblk.n_balk_nos);
298#endif /* #ifdef CONFIG_RCU_BOOST */
299}
300
301#endif /* #ifdef CONFIG_RCU_TRACE */
302
303#ifdef CONFIG_RCU_BOOST
304
305#include "rtmutex_common.h"
306
307#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
308
309/* Controls for rcu_kthread() kthread. */
310static struct task_struct *rcu_kthread_task;
311static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
312static unsigned long have_rcu_kthread_work;
313
314/*
315 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
316 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
317 */
318static int rcu_boost(void)
319{
320 unsigned long flags;
321 struct rt_mutex mtx;
322 struct task_struct *t;
323 struct list_head *tb;
324
325 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
326 rcu_preempt_ctrlblk.exp_tasks == NULL)
327 return 0; /* Nothing to boost. */
328
329 local_irq_save(flags);
330
331 /*
332 * Recheck with irqs disabled: all tasks in need of boosting
333 * might exit their RCU read-side critical sections on their own
334 * if we are preempted just before disabling irqs.
335 */
336 if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
337 rcu_preempt_ctrlblk.exp_tasks == NULL) {
338 local_irq_restore(flags);
339 return 0;
340 }
341
342 /*
343 * Preferentially boost tasks blocking expedited grace periods.
344 * This cannot starve the normal grace periods because a second
345 * expedited grace period must boost all blocked tasks, including
346 * those blocking the pre-existing normal grace period.
347 */
348 if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
349 tb = rcu_preempt_ctrlblk.exp_tasks;
350 RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
351 } else {
352 tb = rcu_preempt_ctrlblk.boost_tasks;
353 RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
354 }
355 RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
356
357 /*
358 * We boost task t by manufacturing an rt_mutex that appears to
359 * be held by task t. We leave a pointer to that rt_mutex where
360 * task t can find it, and task t will release the mutex when it
361 * exits its outermost RCU read-side critical section. Then
362 * simply acquiring this artificial rt_mutex will boost task
363 * t's priority. (Thanks to tglx for suggesting this approach!)
364 */
365 t = container_of(tb, struct task_struct, rcu_node_entry);
366 rt_mutex_init_proxy_locked(&mtx, t);
367 t->rcu_boost_mutex = &mtx;
368 local_irq_restore(flags);
369 rt_mutex_lock(&mtx);
370 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
371
372 return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
373 ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
374}
375
376/*
377 * Check to see if it is now time to start boosting RCU readers blocking
378 * the current grace period, and, if so, tell the rcu_kthread_task to
379 * start boosting them. If there is an expedited boost in progress,
380 * we wait for it to complete.
381 *
382 * If there are no blocked readers blocking the current grace period,
383 * return 0 to let the caller know, otherwise return 1. Note that this
384 * return value is independent of whether or not boosting was done.
385 */
386static int rcu_initiate_boost(void)
387{
388 if (!rcu_preempt_blocked_readers_cgp() &&
389 rcu_preempt_ctrlblk.exp_tasks == NULL) {
390 RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
391 return 0;
392 }
393 if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
394 (rcu_preempt_ctrlblk.gp_tasks != NULL &&
395 rcu_preempt_ctrlblk.boost_tasks == NULL &&
396 ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
397 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
398 rcu_preempt_ctrlblk.boost_tasks =
399 rcu_preempt_ctrlblk.gp_tasks;
400 invoke_rcu_callbacks();
401 } else {
402 RCU_TRACE(rcu_initiate_boost_trace());
403 }
404 return 1;
405}
406
407#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
408
409/*
410 * Do priority-boost accounting for the start of a new grace period.
411 */
412static void rcu_preempt_boost_start_gp(void)
413{
414 rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
415}
416
417#else /* #ifdef CONFIG_RCU_BOOST */
418
419/*
420 * If there is no RCU priority boosting, we don't initiate boosting,
421 * but we do indicate whether there are blocked readers blocking the
422 * current grace period.
423 */
424static int rcu_initiate_boost(void)
425{
426 return rcu_preempt_blocked_readers_cgp();
427}
428
429/*
430 * If there is no RCU priority boosting, nothing to do at grace-period start.
431 */
432static void rcu_preempt_boost_start_gp(void)
433{
434}
435
436#endif /* else #ifdef CONFIG_RCU_BOOST */
437
438/*
439 * Record a preemptible-RCU quiescent state for the specified CPU. Note
440 * that this just means that the task currently running on the CPU is
441 * in a quiescent state. There might be any number of tasks blocked
442 * while in an RCU read-side critical section.
443 *
444 * Unlike the other rcu_*_qs() functions, callers to this function
445 * must disable irqs in order to protect the assignment to
446 * ->rcu_read_unlock_special.
447 *
448 * Because this is a single-CPU implementation, the only way a grace
449 * period can end is if the CPU is in a quiescent state. The reason is
450 * that a blocked preemptible-RCU reader can exit its critical section
451 * only if the CPU is running it at the time. Therefore, when the
452 * last task blocking the current grace period exits its RCU read-side
453 * critical section, neither the CPU nor blocked tasks will be stopping
454 * the current grace period. (In contrast, SMP implementations
455 * might have CPUs running in RCU read-side critical sections that
456 * block later grace periods -- but this is not possible given only
457 * one CPU.)
458 */
459static void rcu_preempt_cpu_qs(void)
460{
461 /* Record both CPU and task as having responded to current GP. */
462 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
463 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
464
465 /* If there is no GP then there is nothing more to do. */
466 if (!rcu_preempt_gp_in_progress())
467 return;
468 /*
469 * Check up on boosting. If there are readers blocking the
470 * current grace period, leave.
471 */
472 if (rcu_initiate_boost())
473 return;
474
475 /* Advance callbacks. */
476 rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
477 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
478 rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
479
480 /* If there are no blocked readers, next GP is done instantly. */
481 if (!rcu_preempt_blocked_readers_any())
482 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
483
484 /* If there are done callbacks, cause them to be invoked. */
485 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
486 invoke_rcu_callbacks();
487}
488
489/*
490 * Start a new RCU grace period if warranted. Hard irqs must be disabled.
491 */
492static void rcu_preempt_start_gp(void)
493{
494 if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
495
496 /* Official start of GP. */
497 rcu_preempt_ctrlblk.gpnum++;
498 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
499 reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);
500
501 /* Any blocked RCU readers block new GP. */
502 if (rcu_preempt_blocked_readers_any())
503 rcu_preempt_ctrlblk.gp_tasks =
504 rcu_preempt_ctrlblk.blkd_tasks.next;
505
506 /* Set up for RCU priority boosting. */
507 rcu_preempt_boost_start_gp();
508
509 /* If there is no running reader, CPU is done with GP. */
510 if (!rcu_preempt_running_reader())
511 rcu_preempt_cpu_qs();
512 }
513}
514
515/*
516 * We have entered the scheduler, and the current task might soon be
517 * context-switched away from. If this task is in an RCU read-side
518 * critical section, we will no longer be able to rely on the CPU to
519 * record that fact, so we enqueue the task on the blkd_tasks list.
520 * If the task started after the current grace period began, as recorded
521 * by ->gpcpu, we enqueue at the beginning of the list. Otherwise
522 * before the element referenced by ->gp_tasks (or at the tail if
523 * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
524 * The task will dequeue itself when it exits the outermost enclosing
525 * RCU read-side critical section. Therefore, the current grace period
526 * cannot be permitted to complete until the ->gp_tasks pointer becomes
527 * NULL.
528 *
529 * Caller must disable preemption.
530 */
531void rcu_preempt_note_context_switch(void)
532{
533 struct task_struct *t = current;
534 unsigned long flags;
535
536 local_irq_save(flags); /* must exclude scheduler_tick(). */
537 if (rcu_preempt_running_reader() > 0 &&
538 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
539
540 /* Possibly blocking in an RCU read-side critical section. */
541 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
542
543 /*
544 * If this CPU has already checked in, then this task
545 * will hold up the next grace period rather than the
546 * current grace period. Queue the task accordingly.
547 * If the task is queued for the current grace period
548 * (i.e., this CPU has not yet passed through a quiescent
549 * state for the current grace period), then as long
550 * as that task remains queued, the current grace period
551 * cannot end.
552 */
553 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
554 if (rcu_cpu_blocking_cur_gp())
555 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
556 } else if (rcu_preempt_running_reader() < 0 &&
557 t->rcu_read_unlock_special) {
558 /*
559 * Complete exit from RCU read-side critical section on
560 * behalf of preempted instance of __rcu_read_unlock().
561 */
562 rcu_read_unlock_special(t);
563 }
564
565 /*
566 * Either we were not in an RCU read-side critical section to
567 * begin with, or we have now recorded that critical section
568 * globally. Either way, we can now note a quiescent state
569 * for this CPU. Again, if we were in an RCU read-side critical
570 * section, and if that critical section was blocking the current
571 * grace period, then the fact that the task has been enqueued
572 * means that current grace period continues to be blocked.
573 */
574 rcu_preempt_cpu_qs();
575 local_irq_restore(flags);
576}
577
578/*
579 * Handle special cases during rcu_read_unlock(), such as needing to
580 * notify RCU core processing or task having blocked during the RCU
581 * read-side critical section.
582 */
583void rcu_read_unlock_special(struct task_struct *t)
584{
585 int empty;
586 int empty_exp;
587 unsigned long flags;
588 struct list_head *np;
589#ifdef CONFIG_RCU_BOOST
590 struct rt_mutex *rbmp = NULL;
591#endif /* #ifdef CONFIG_RCU_BOOST */
592 int special;
593
594 /*
595 * NMI handlers cannot block and cannot safely manipulate state.
596 * They therefore cannot possibly be special, so just leave.
597 */
598 if (in_nmi())
599 return;
600
601 local_irq_save(flags);
602
603 /*
604 * If RCU core is waiting for this CPU to exit critical section,
605 * let it know that we have done so.
606 */
607 special = t->rcu_read_unlock_special;
608 if (special & RCU_READ_UNLOCK_NEED_QS)
609 rcu_preempt_cpu_qs();
610
611 /* Hardware IRQ handlers cannot block. */
612 if (in_irq() || in_serving_softirq()) {
613 local_irq_restore(flags);
614 return;
615 }
616
617 /* Clean up if blocked during RCU read-side critical section. */
618 if (special & RCU_READ_UNLOCK_BLOCKED) {
619 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
620
621 /*
622 * Remove this task from the ->blkd_tasks list and adjust
623 * any pointers that might have been referencing it.
624 */
625 empty = !rcu_preempt_blocked_readers_cgp();
626 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
627 np = rcu_next_node_entry(t);
628 list_del_init(&t->rcu_node_entry);
629 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
630 rcu_preempt_ctrlblk.gp_tasks = np;
631 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
632 rcu_preempt_ctrlblk.exp_tasks = np;
633#ifdef CONFIG_RCU_BOOST
634 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
635 rcu_preempt_ctrlblk.boost_tasks = np;
636#endif /* #ifdef CONFIG_RCU_BOOST */
637
638 /*
639 * If this was the last task on the current list, and if
640 * we aren't waiting on the CPU, report the quiescent state
641 * and start a new grace period if needed.
642 */
643 if (!empty && !rcu_preempt_blocked_readers_cgp()) {
644 rcu_preempt_cpu_qs();
645 rcu_preempt_start_gp();
646 }
647
648 /*
649 * If this was the last task on the expedited lists,
 650		 * then we need to wake up the waiting task.
651 */
652 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
653 rcu_report_exp_done();
654 }
655#ifdef CONFIG_RCU_BOOST
656 /* Unboost self if was boosted. */
657 if (t->rcu_boost_mutex != NULL) {
658 rbmp = t->rcu_boost_mutex;
659 t->rcu_boost_mutex = NULL;
660 rt_mutex_unlock(rbmp);
661 }
662#endif /* #ifdef CONFIG_RCU_BOOST */
663 local_irq_restore(flags);
664}
665
666/*
667 * Check for a quiescent state from the current CPU. When a task blocks,
668 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
669 * checked elsewhere. This is called from the scheduling-clock interrupt.
670 *
671 * Caller must disable hard irqs.
672 */
673static void rcu_preempt_check_callbacks(void)
674{
675 struct task_struct *t = current;
676
677 if (rcu_preempt_gp_in_progress() &&
678 (!rcu_preempt_running_reader() ||
679 !rcu_cpu_blocking_cur_gp()))
680 rcu_preempt_cpu_qs();
681 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
682 rcu_preempt_ctrlblk.rcb.donetail)
683 invoke_rcu_callbacks();
684 if (rcu_preempt_gp_in_progress() &&
685 rcu_cpu_blocking_cur_gp() &&
686 rcu_preempt_running_reader() > 0)
687 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
688}
689
690/*
691 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
692 * update, so this is invoked from rcu_process_callbacks() to
693 * handle that case. Of course, it is invoked for all flavors of
694 * RCU, but RCU callbacks can appear only on one of the lists, and
695 * neither ->nexttail nor ->donetail can possibly be NULL, so there
696 * is no need for an explicit check.
697 */
698static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
699{
700 if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
701 rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
702}
703
704/*
705 * Process callbacks for preemptible RCU.
706 */
707static void rcu_preempt_process_callbacks(void)
708{
709 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
710}
711
712/*
 713 * Queue a preemptible-RCU callback for invocation after a grace period.
714 */
715void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
716{
717 unsigned long flags;
718
719 debug_rcu_head_queue(head);
720 head->func = func;
721 head->next = NULL;
722
723 local_irq_save(flags);
724 *rcu_preempt_ctrlblk.nexttail = head;
725 rcu_preempt_ctrlblk.nexttail = &head->next;
726 RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
727 rcu_preempt_start_gp(); /* checks to see if GP needed. */
728 local_irq_restore(flags);
729}
730EXPORT_SYMBOL_GPL(call_rcu);
731
732/*
733 * synchronize_rcu - wait until a grace period has elapsed.
734 *
735 * Control will return to the caller some time after a full grace
736 * period has elapsed, in other words after all currently executing RCU
737 * read-side critical sections have completed. RCU read-side critical
738 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
739 * and may be nested.
740 */
741void synchronize_rcu(void)
742{
743 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
744 !lock_is_held(&rcu_lock_map) &&
745 !lock_is_held(&rcu_sched_lock_map),
746 "Illegal synchronize_rcu() in RCU read-side critical section");
747
748#ifdef CONFIG_DEBUG_LOCK_ALLOC
749 if (!rcu_scheduler_active)
750 return;
751#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
752
753 WARN_ON_ONCE(rcu_preempt_running_reader());
754 if (!rcu_preempt_blocked_readers_any())
755 return;
756
757 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
758 if (rcu_expedited)
759 synchronize_rcu_expedited();
760 else
761 rcu_barrier();
762}
763EXPORT_SYMBOL_GPL(synchronize_rcu);
764
765static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
766static unsigned long sync_rcu_preempt_exp_count;
767static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
768
769/*
770 * Return non-zero if there are any tasks in RCU read-side critical
771 * sections blocking the current preemptible-RCU expedited grace period.
772 * If there is no preemptible-RCU expedited grace period currently in
773 * progress, returns zero unconditionally.
774 */
775static int rcu_preempted_readers_exp(void)
776{
777 return rcu_preempt_ctrlblk.exp_tasks != NULL;
778}
779
780/*
781 * Report the exit from RCU read-side critical section for the last task
782 * that queued itself during or before the current expedited preemptible-RCU
783 * grace period.
784 */
785static void rcu_report_exp_done(void)
786{
787 wake_up(&sync_rcu_preempt_exp_wq);
788}
789
790/*
791 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
 792 * is to rely on the fact that there is but one CPU, and that it is
793 * illegal for a task to invoke synchronize_rcu_expedited() while in a
794 * preemptible-RCU read-side critical section. Therefore, any such
795 * critical sections must correspond to blocked tasks, which must therefore
796 * be on the ->blkd_tasks list. So just record the current head of the
797 * list in the ->exp_tasks pointer, and wait for all tasks including and
798 * after the task pointed to by ->exp_tasks to drain.
799 */
800void synchronize_rcu_expedited(void)
801{
802 unsigned long flags;
803 struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
804 unsigned long snap;
805
806 barrier(); /* ensure prior action seen before grace period. */
807
808 WARN_ON_ONCE(rcu_preempt_running_reader());
809
810 /*
811 * Acquire lock so that there is only one preemptible RCU grace
812 * period in flight. Of course, if someone does the expedited
813 * grace period for us while we are acquiring the lock, just leave.
814 */
815 snap = sync_rcu_preempt_exp_count + 1;
816 mutex_lock(&sync_rcu_preempt_exp_mutex);
817 if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
818 goto unlock_mb_ret; /* Others did our work for us. */
819
820 local_irq_save(flags);
821
822 /*
823 * All RCU readers have to already be on blkd_tasks because
824 * we cannot legally be executing in an RCU read-side critical
825 * section.
826 */
827
828 /* Snapshot current head of ->blkd_tasks list. */
829 rpcp->exp_tasks = rpcp->blkd_tasks.next;
830 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
831 rpcp->exp_tasks = NULL;
832
833 /* Wait for tail of ->blkd_tasks list to drain. */
834 if (!rcu_preempted_readers_exp()) {
835 local_irq_restore(flags);
836 } else {
837 rcu_initiate_boost();
838 local_irq_restore(flags);
839 wait_event(sync_rcu_preempt_exp_wq,
840 !rcu_preempted_readers_exp());
841 }
842
843 /* Clean up and exit. */
844 barrier(); /* ensure expedited GP seen before counter increment. */
845 sync_rcu_preempt_exp_count++;
846unlock_mb_ret:
847 mutex_unlock(&sync_rcu_preempt_exp_mutex);
848 barrier(); /* ensure subsequent action seen after grace period. */
849}
850EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
851
852/*
853 * Does preemptible RCU need the CPU to stay out of dynticks mode?
854 */
855int rcu_preempt_needs_cpu(void)
856{
857 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
858}
859
860#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
861
862#ifdef CONFIG_RCU_TRACE
863
864/*
865 * Because preemptible RCU does not exist, it is not necessary to
866 * dump out its statistics.
867 */
868static void show_tiny_preempt_stats(struct seq_file *m)
869{
870}
871
872#endif /* #ifdef CONFIG_RCU_TRACE */
873
874/*
875 * Because preemptible RCU does not exist, it never has any callbacks
876 * to check.
877 */
878static void rcu_preempt_check_callbacks(void)
879{
880}
881
882/*
883 * Because preemptible RCU does not exist, it never has any callbacks
884 * to remove.
885 */
886static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
887{
888}
889
890/*
891 * Because preemptible RCU does not exist, it never has any callbacks
892 * to process.
893 */
894static void rcu_preempt_process_callbacks(void)
895{
896}
897
898#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
899
900#ifdef CONFIG_RCU_BOOST
901
902/*
903 * Wake up rcu_kthread() to process callbacks now eligible for invocation
904 * or to boost readers.
905 */
906static void invoke_rcu_callbacks(void)
907{
908 have_rcu_kthread_work = 1;
909 if (rcu_kthread_task != NULL)
910 wake_up(&rcu_kthread_wq);
911}
912
913#ifdef CONFIG_RCU_TRACE
914
915/*
916 * Is the current CPU running the RCU-callbacks kthread?
917 * Caller must have preemption disabled.
918 */
919static bool rcu_is_callbacks_kthread(void)
920{
921 return rcu_kthread_task == current;
922}
923
924#endif /* #ifdef CONFIG_RCU_TRACE */
925
926/*
927 * This kthread invokes RCU callbacks whose grace periods have
928 * elapsed. It is awakened as needed, and takes the place of the
929 * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
930 * This is a kthread, but it is never stopped, at least not until
931 * the system goes down.
932 */
933static int rcu_kthread(void *arg)
934{
935 unsigned long work;
936 unsigned long morework;
937 unsigned long flags;
938
939 for (;;) {
940 wait_event_interruptible(rcu_kthread_wq,
941 have_rcu_kthread_work != 0);
942 morework = rcu_boost();
943 local_irq_save(flags);
944 work = have_rcu_kthread_work;
945 have_rcu_kthread_work = morework;
946 local_irq_restore(flags);
947 if (work)
948 rcu_process_callbacks(NULL);
949 schedule_timeout_interruptible(1); /* Leave CPU for others. */
950 }
951
952 return 0; /* Not reached, but needed to shut gcc up. */
953}
954
955/*
956 * Spawn the kthread that invokes RCU callbacks.
957 */
958static int __init rcu_spawn_kthreads(void)
959{
960 struct sched_param sp;
961
962 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
963 sp.sched_priority = RCU_BOOST_PRIO;
964 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
965 return 0;
966}
967early_initcall(rcu_spawn_kthreads);
968
969#else /* #ifdef CONFIG_RCU_BOOST */
970
971/* Hold off callback invocation until early_initcall() time. */
972static int rcu_scheduler_fully_active __read_mostly;
973
974/*
975 * Start up softirq processing of callbacks.
976 */
977void invoke_rcu_callbacks(void)
978{
979 if (rcu_scheduler_fully_active)
980 raise_softirq(RCU_SOFTIRQ);
981}
982
983#ifdef CONFIG_RCU_TRACE
984
985/*
986 * There is no callback kthread, so this thread is never it.
987 */
988static bool rcu_is_callbacks_kthread(void)
989{
990 return false;
991}
992
993#endif /* #ifdef CONFIG_RCU_TRACE */
994
995static int __init rcu_scheduler_really_started(void)
996{
997 rcu_scheduler_fully_active = 1;
998 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
999 raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */
1000 return 0;
1001}
1002early_initcall(rcu_scheduler_really_started);
1003
1004#endif /* #else #ifdef CONFIG_RCU_BOOST */
1005
1006#ifdef CONFIG_DEBUG_LOCK_ALLOC
1007#include <linux/kernel_stat.h>
1008 60
1009/* 61/*
1010 * During boot, we forgive RCU lockdep issues. After this function is 62 * During boot, we forgive RCU lockdep issues. After this function is
@@ -1020,25 +72,6 @@ void __init rcu_scheduler_starting(void)
1020 72
1021#ifdef CONFIG_RCU_TRACE 73#ifdef CONFIG_RCU_TRACE
1022 74
1023#ifdef CONFIG_RCU_BOOST
1024
1025static void rcu_initiate_boost_trace(void)
1026{
1027 if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
1028 rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
1029 else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
1030 rcu_preempt_ctrlblk.exp_tasks == NULL)
1031 rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
1032 else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
1033 rcu_preempt_ctrlblk.n_balk_boost_tasks++;
1034 else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
1035 rcu_preempt_ctrlblk.n_balk_notyet++;
1036 else
1037 rcu_preempt_ctrlblk.n_balk_nos++;
1038}
1039
1040#endif /* #ifdef CONFIG_RCU_BOOST */
1041
1042static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) 75static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
1043{ 76{
1044 unsigned long flags; 77 unsigned long flags;
@@ -1053,7 +86,6 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
1053 */ 86 */
1054static int show_tiny_stats(struct seq_file *m, void *unused) 87static int show_tiny_stats(struct seq_file *m, void *unused)
1055{ 88{
1056 show_tiny_preempt_stats(m);
1057 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); 89 seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
1058 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); 90 seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
1059 return 0; 91 return 0;
@@ -1103,11 +135,40 @@ MODULE_AUTHOR("Paul E. McKenney");
1103MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); 135MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1104MODULE_LICENSE("GPL"); 136MODULE_LICENSE("GPL");
1105 137
1106static void check_cpu_stall_preempt(void) 138static void check_cpu_stall(struct rcu_ctrlblk *rcp)
1107{ 139{
1108#ifdef CONFIG_TINY_PREEMPT_RCU 140 unsigned long j;
1109 check_cpu_stall(&rcu_preempt_ctrlblk.rcb); 141 unsigned long js;
1110#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 142
143 if (rcu_cpu_stall_suppress)
144 return;
145 rcp->ticks_this_gp++;
146 j = jiffies;
147 js = rcp->jiffies_stall;
148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
151 jiffies - rcp->gp_start, rcp->qlen);
152 dump_stack();
153 }
154 if (*rcp->curtail && ULONG_CMP_GE(j, js))
155 rcp->jiffies_stall = jiffies +
156 3 * rcu_jiffies_till_stall_check() + 3;
157 else if (ULONG_CMP_GE(j, js))
158 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
159}
160
161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
162{
163 rcp->ticks_this_gp = 0;
164 rcp->gp_start = jiffies;
165 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
166}
167
168static void check_cpu_stalls(void)
169{
170 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
171 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
1111} 172}
1112 173
1113#endif /* #ifdef CONFIG_RCU_TRACE */ 174#endif /* #ifdef CONFIG_RCU_TRACE */
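Among the TINY_PREEMPT_RCU code removed above is rcu_boost(), which lends the booster's priority to a blocked reader by making a proxy rt_mutex look as if the reader already holds it. That machinery is kernel-internal, but the inheritance it relies on can be observed from userspace with a PTHREAD_PRIO_INHERIT mutex; the sketch below is only an analogy, not the removed code, and the boost is only visible if the two threads run with distinct real-time priorities.

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t pi_mutex;

static void *holder(void *arg)
{
	pthread_mutex_lock(&pi_mutex);
	/* While a higher-priority thread blocks on pi_mutex below, this
	 * thread inherits that priority -- the effect rcu_boost() obtained
	 * via rt_mutex_init_proxy_locked() on behalf of a blocked reader. */
	usleep(100 * 1000);
	pthread_mutex_unlock(&pi_mutex);
	return NULL;
}

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_t t;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&pi_mutex, &attr);

	pthread_create(&t, NULL, holder, NULL);
	usleep(10 * 1000);			/* let the holder take the lock */
	pthread_mutex_lock(&pi_mutex);		/* blocking here boosts the holder */
	pthread_mutex_unlock(&pi_mutex);
	pthread_join(t, NULL);
	puts("holder released the PI mutex");
	return 0;
}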
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e1f3a8c96724..b1fa5510388d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -695,44 +695,6 @@ static struct rcu_torture_ops srcu_sync_ops = {
695 .name = "srcu_sync" 695 .name = "srcu_sync"
696}; 696};
697 697
698static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
699{
700 return srcu_read_lock_raw(&srcu_ctl);
701}
702
703static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
704{
705 srcu_read_unlock_raw(&srcu_ctl, idx);
706}
707
708static struct rcu_torture_ops srcu_raw_ops = {
709 .init = rcu_sync_torture_init,
710 .readlock = srcu_torture_read_lock_raw,
711 .read_delay = srcu_read_delay,
712 .readunlock = srcu_torture_read_unlock_raw,
713 .completed = srcu_torture_completed,
714 .deferred_free = srcu_torture_deferred_free,
715 .sync = srcu_torture_synchronize,
716 .call = NULL,
717 .cb_barrier = NULL,
718 .stats = srcu_torture_stats,
719 .name = "srcu_raw"
720};
721
722static struct rcu_torture_ops srcu_raw_sync_ops = {
723 .init = rcu_sync_torture_init,
724 .readlock = srcu_torture_read_lock_raw,
725 .read_delay = srcu_read_delay,
726 .readunlock = srcu_torture_read_unlock_raw,
727 .completed = srcu_torture_completed,
728 .deferred_free = rcu_sync_torture_deferred_free,
729 .sync = srcu_torture_synchronize,
730 .call = NULL,
731 .cb_barrier = NULL,
732 .stats = srcu_torture_stats,
733 .name = "srcu_raw_sync"
734};
735
736static void srcu_torture_synchronize_expedited(void) 698static void srcu_torture_synchronize_expedited(void)
737{ 699{
738 synchronize_srcu_expedited(&srcu_ctl); 700 synchronize_srcu_expedited(&srcu_ctl);
@@ -1983,7 +1945,6 @@ rcu_torture_init(void)
1983 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1945 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1984 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1946 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1985 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, 1947 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
1986 &srcu_raw_ops, &srcu_raw_sync_ops,
1987 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1948 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1988 1949
1989 mutex_lock(&fullstop_mutex); 1950 mutex_lock(&fullstop_mutex);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 16ea67925015..e08abb9461ac 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -218,8 +218,8 @@ module_param(blimit, long, 0444);
218module_param(qhimark, long, 0444); 218module_param(qhimark, long, 0444);
219module_param(qlowmark, long, 0444); 219module_param(qlowmark, long, 0444);
220 220
221static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; 221static ulong jiffies_till_first_fqs = ULONG_MAX;
222static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; 222static ulong jiffies_till_next_fqs = ULONG_MAX;
223 223
224module_param(jiffies_till_first_fqs, ulong, 0644); 224module_param(jiffies_till_first_fqs, ulong, 0644);
225module_param(jiffies_till_next_fqs, ulong, 0644); 225module_param(jiffies_till_next_fqs, ulong, 0644);
@@ -866,7 +866,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
866 * See Documentation/RCU/stallwarn.txt for info on how to debug 866 * See Documentation/RCU/stallwarn.txt for info on how to debug
867 * RCU CPU stall warnings. 867 * RCU CPU stall warnings.
868 */ 868 */
869 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", 869 pr_err("INFO: %s detected stalls on CPUs/tasks:",
870 rsp->name); 870 rsp->name);
871 print_cpu_stall_info_begin(); 871 print_cpu_stall_info_begin();
872 rcu_for_each_leaf_node(rsp, rnp) { 872 rcu_for_each_leaf_node(rsp, rnp) {
@@ -899,7 +899,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
899 smp_processor_id(), (long)(jiffies - rsp->gp_start), 899 smp_processor_id(), (long)(jiffies - rsp->gp_start),
900 rsp->gpnum, rsp->completed, totqlen); 900 rsp->gpnum, rsp->completed, totqlen);
901 if (ndetected == 0) 901 if (ndetected == 0)
902 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 902 pr_err("INFO: Stall ended before state dump start\n");
903 else if (!trigger_all_cpu_backtrace()) 903 else if (!trigger_all_cpu_backtrace())
904 rcu_dump_cpu_stacks(rsp); 904 rcu_dump_cpu_stacks(rsp);
905 905
@@ -922,7 +922,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
922 * See Documentation/RCU/stallwarn.txt for info on how to debug 922 * See Documentation/RCU/stallwarn.txt for info on how to debug
923 * RCU CPU stall warnings. 923 * RCU CPU stall warnings.
924 */ 924 */
925 printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); 925 pr_err("INFO: %s self-detected stall on CPU", rsp->name);
926 print_cpu_stall_info_begin(); 926 print_cpu_stall_info_begin();
927 print_cpu_stall_info(rsp, smp_processor_id()); 927 print_cpu_stall_info(rsp, smp_processor_id());
928 print_cpu_stall_info_end(); 928 print_cpu_stall_info_end();
@@ -985,65 +985,6 @@ void rcu_cpu_stall_reset(void)
985} 985}
986 986
987/* 987/*
988 * Update CPU-local rcu_data state to record the newly noticed grace period.
989 * This is used both when we started the grace period and when we notice
990 * that someone else started the grace period. The caller must hold the
991 * ->lock of the leaf rcu_node structure corresponding to the current CPU,
992 * and must have irqs disabled.
993 */
994static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
995{
996 if (rdp->gpnum != rnp->gpnum) {
997 /*
998 * If the current grace period is waiting for this CPU,
999 * set up to detect a quiescent state, otherwise don't
1000 * go looking for one.
1001 */
1002 rdp->gpnum = rnp->gpnum;
1003 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
1004 rdp->passed_quiesce = 0;
1005 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1006 zero_cpu_stall_ticks(rdp);
1007 }
1008}
1009
1010static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
1011{
1012 unsigned long flags;
1013 struct rcu_node *rnp;
1014
1015 local_irq_save(flags);
1016 rnp = rdp->mynode;
1017 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
1018 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
1019 local_irq_restore(flags);
1020 return;
1021 }
1022 __note_new_gpnum(rsp, rnp, rdp);
1023 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1024}
1025
1026/*
1027 * Did someone else start a new RCU grace period since we last
1028 * checked? Update local state appropriately if so. Must be called
1029 * on the CPU corresponding to rdp.
1030 */
1031static int
1032check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
1033{
1034 unsigned long flags;
1035 int ret = 0;
1036
1037 local_irq_save(flags);
1038 if (rdp->gpnum != rsp->gpnum) {
1039 note_new_gpnum(rsp, rdp);
1040 ret = 1;
1041 }
1042 local_irq_restore(flags);
1043 return ret;
1044}
1045
1046/*
1047 * Initialize the specified rcu_data structure's callback list to empty. 988 * Initialize the specified rcu_data structure's callback list to empty.
1048 */ 989 */
1049static void init_callback_list(struct rcu_data *rdp) 990static void init_callback_list(struct rcu_data *rdp)
@@ -1313,18 +1254,16 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1313} 1254}
1314 1255
1315/* 1256/*
1316 * Advance this CPU's callbacks, but only if the current grace period 1257 * Update CPU-local rcu_data state to record the beginnings and ends of
1317 * has ended. This may be called only from the CPU to whom the rdp 1258 * grace periods. The caller must hold the ->lock of the leaf rcu_node
1318 * belongs. In addition, the corresponding leaf rcu_node structure's 1259 * structure corresponding to the current CPU, and must have irqs disabled.
1319 * ->lock must be held by the caller, with irqs disabled.
1320 */ 1260 */
1321static void 1261static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1322__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1323{ 1262{
1324 /* Did another grace period end? */ 1263 /* Handle the ends of any preceding grace periods first. */
1325 if (rdp->completed == rnp->completed) { 1264 if (rdp->completed == rnp->completed) {
1326 1265
1327 /* No, so just accelerate recent callbacks. */ 1266 /* No grace period end, so just accelerate recent callbacks. */
1328 rcu_accelerate_cbs(rsp, rnp, rdp); 1267 rcu_accelerate_cbs(rsp, rnp, rdp);
1329 1268
1330 } else { 1269 } else {
@@ -1335,68 +1274,40 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
1335 /* Remember that we saw this grace-period completion. */ 1274 /* Remember that we saw this grace-period completion. */
1336 rdp->completed = rnp->completed; 1275 rdp->completed = rnp->completed;
1337 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); 1276 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
1277 }
1338 1278
1279 if (rdp->gpnum != rnp->gpnum) {
1339 /* 1280 /*
1340 * If we were in an extended quiescent state, we may have 1281 * If the current grace period is waiting for this CPU,
1341		 * missed some grace periods that other CPUs handled on 1282		 * set up to detect a quiescent state, otherwise don't
1342 * our behalf. Catch up with this state to avoid noting 1283 * go looking for one.
1343 * spurious new grace periods. If another grace period
1344 * has started, then rnp->gpnum will have advanced, so
1345 * we will detect this later on. Of course, any quiescent
1346 * states we found for the old GP are now invalid.
1347 */
1348 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) {
1349 rdp->gpnum = rdp->completed;
1350 rdp->passed_quiesce = 0;
1351 }
1352
1353 /*
1354 * If RCU does not need a quiescent state from this CPU,
1355 * then make sure that this CPU doesn't go looking for one.
1356 */ 1284 */
1357 if ((rnp->qsmask & rdp->grpmask) == 0) 1285 rdp->gpnum = rnp->gpnum;
1358 rdp->qs_pending = 0; 1286 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
1287 rdp->passed_quiesce = 0;
1288 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1289 zero_cpu_stall_ticks(rdp);
1359 } 1290 }
1360} 1291}
1361 1292
1362/* 1293static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1363 * Advance this CPU's callbacks, but only if the current grace period
1364 * has ended. This may be called only from the CPU to whom the rdp
1365 * belongs.
1366 */
1367static void
1368rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
1369{ 1294{
1370 unsigned long flags; 1295 unsigned long flags;
1371 struct rcu_node *rnp; 1296 struct rcu_node *rnp;
1372 1297
1373 local_irq_save(flags); 1298 local_irq_save(flags);
1374 rnp = rdp->mynode; 1299 rnp = rdp->mynode;
1375 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ 1300 if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
1301 rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */
1376 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ 1302 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
1377 local_irq_restore(flags); 1303 local_irq_restore(flags);
1378 return; 1304 return;
1379 } 1305 }
1380 __rcu_process_gp_end(rsp, rnp, rdp); 1306 __note_gp_changes(rsp, rnp, rdp);
1381 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1307 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1382} 1308}
1383 1309
1384/* 1310/*
1385 * Do per-CPU grace-period initialization for running CPU. The caller
1386 * must hold the lock of the leaf rcu_node structure corresponding to
1387 * this CPU.
1388 */
1389static void
1390rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1391{
1392 /* Prior grace period ended, so advance callbacks for current CPU. */
1393 __rcu_process_gp_end(rsp, rnp, rdp);
1394
1395 /* Set state so that this CPU will detect the next quiescent state. */
1396 __note_new_gpnum(rsp, rnp, rdp);
1397}
1398
1399/*
1400 * Initialize a new grace period. 1311 * Initialize a new grace period.
1401 */ 1312 */
1402static int rcu_gp_init(struct rcu_state *rsp) 1313static int rcu_gp_init(struct rcu_state *rsp)
@@ -1444,16 +1355,16 @@ static int rcu_gp_init(struct rcu_state *rsp)
1444 WARN_ON_ONCE(rnp->completed != rsp->completed); 1355 WARN_ON_ONCE(rnp->completed != rsp->completed);
1445 ACCESS_ONCE(rnp->completed) = rsp->completed; 1356 ACCESS_ONCE(rnp->completed) = rsp->completed;
1446 if (rnp == rdp->mynode) 1357 if (rnp == rdp->mynode)
1447 rcu_start_gp_per_cpu(rsp, rnp, rdp); 1358 __note_gp_changes(rsp, rnp, rdp);
1448 rcu_preempt_boost_start_gp(rnp); 1359 rcu_preempt_boost_start_gp(rnp);
1449 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 1360 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1450 rnp->level, rnp->grplo, 1361 rnp->level, rnp->grplo,
1451 rnp->grphi, rnp->qsmask); 1362 rnp->grphi, rnp->qsmask);
1452 raw_spin_unlock_irq(&rnp->lock); 1363 raw_spin_unlock_irq(&rnp->lock);
1453#ifdef CONFIG_PROVE_RCU_DELAY 1364#ifdef CONFIG_PROVE_RCU_DELAY
1454 if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && 1365 if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 &&
1455 system_state == SYSTEM_RUNNING) 1366 system_state == SYSTEM_RUNNING)
1456 schedule_timeout_uninterruptible(2); 1367 udelay(200);
1457#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ 1368#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1458 cond_resched(); 1369 cond_resched();
1459 } 1370 }
@@ -1527,7 +1438,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1527 ACCESS_ONCE(rnp->completed) = rsp->gpnum; 1438 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1528 rdp = this_cpu_ptr(rsp->rda); 1439 rdp = this_cpu_ptr(rsp->rda);
1529 if (rnp == rdp->mynode) 1440 if (rnp == rdp->mynode)
1530 __rcu_process_gp_end(rsp, rnp, rdp); 1441 __note_gp_changes(rsp, rnp, rdp);
1531 nocb += rcu_future_gp_cleanup(rsp, rnp); 1442 nocb += rcu_future_gp_cleanup(rsp, rnp);
1532 raw_spin_unlock_irq(&rnp->lock); 1443 raw_spin_unlock_irq(&rnp->lock);
1533 cond_resched(); 1444 cond_resched();
@@ -1613,6 +1524,14 @@ static int __noreturn rcu_gp_kthread(void *arg)
1613 } 1524 }
1614} 1525}
1615 1526
1527static void rsp_wakeup(struct irq_work *work)
1528{
1529 struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work);
1530
1531 /* Wake up rcu_gp_kthread() to start the grace period. */
1532 wake_up(&rsp->gp_wq);
1533}
1534
1616/* 1535/*
1617 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1536 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1618 * in preparation for detecting the next grace period. The caller must hold 1537 * in preparation for detecting the next grace period. The caller must hold
@@ -1637,8 +1556,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1637 } 1556 }
1638 rsp->gp_flags = RCU_GP_FLAG_INIT; 1557 rsp->gp_flags = RCU_GP_FLAG_INIT;
1639 1558
1640 /* Wake up rcu_gp_kthread() to start the grace period. */ 1559 /*
1641 wake_up(&rsp->gp_wq); 1560 * We can't do wakeups while holding the rnp->lock, as that
1561 * could cause possible deadlocks with the rq->lock. Defer
1562 * the wakeup to interrupt context.
1563 */
1564 irq_work_queue(&rsp->wakeup_work);
1642} 1565}
1643 1566
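
The hunk above replaces a direct wake_up() (which would run with the rcu_node
->lock held) with irq_work_queue(), so the actual wakeup happens later from
hard-interrupt context via rsp_wakeup(). A minimal sketch of that deferred-wakeup
shape, with hypothetical names (the demo_* struct and functions are illustrative
only, not code from this patch):

#include <linux/irq_work.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

/* Illustrative only: state whose raw lock must not nest inside wakeups. */
struct demo_state {
	raw_spinlock_t		lock;
	wait_queue_head_t	wq;
	struct irq_work		wakeup_work;
};

static void demo_wakeup(struct irq_work *work)
{
	struct demo_state *st = container_of(work, struct demo_state, wakeup_work);

	wake_up(&st->wq);			/* safe: the raw lock is not held here */
}

static void demo_init(struct demo_state *st)
{
	raw_spin_lock_init(&st->lock);
	init_waitqueue_head(&st->wq);
	init_irq_work(&st->wakeup_work, demo_wakeup);
}

static void demo_kick(struct demo_state *st)
{
	raw_spin_lock(&st->lock);
	/* ... update state the sleeper is waiting on ... */
	irq_work_queue(&st->wakeup_work);	/* defer the wake_up() */
	raw_spin_unlock(&st->lock);
}

The matching init_irq_work(&rsp->wakeup_work, rsp_wakeup) call appears in the
rcu_init_one() hunk further down.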
1644/* 1567/*
@@ -1793,9 +1716,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1793static void 1716static void
1794rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) 1717rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1795{ 1718{
1796 /* If there is now a new grace period, record and return. */ 1719 /* Check for grace-period ends and beginnings. */
1797 if (check_for_new_grace_period(rsp, rdp)) 1720 note_gp_changes(rsp, rdp);
1798 return;
1799 1721
1800 /* 1722 /*
1801 * Does this CPU still need to do its part for current grace period? 1723 * Does this CPU still need to do its part for current grace period?
@@ -2259,9 +2181,6 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2259 2181
2260 WARN_ON_ONCE(rdp->beenonline == 0); 2182 WARN_ON_ONCE(rdp->beenonline == 0);
2261 2183
2262 /* Handle the end of a grace period that some other CPU ended. */
2263 rcu_process_gp_end(rsp, rdp);
2264
2265 /* Update RCU state based on any recent quiescent states. */ 2184 /* Update RCU state based on any recent quiescent states. */
2266 rcu_check_quiescent_state(rsp, rdp); 2185 rcu_check_quiescent_state(rsp, rdp);
2267 2186
@@ -2346,8 +2265,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2346 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 2265 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
2347 2266
2348 /* Are we ignoring a completed grace period? */ 2267 /* Are we ignoring a completed grace period? */
2349 rcu_process_gp_end(rsp, rdp); 2268 note_gp_changes(rsp, rdp);
2350 check_for_new_grace_period(rsp, rdp);
2351 2269
2352 /* Start a new grace period if one not already started. */ 2270 /* Start a new grace period if one not already started. */
2353 if (!rcu_gp_in_progress(rsp)) { 2271 if (!rcu_gp_in_progress(rsp)) {
@@ -3108,7 +3026,7 @@ static int __init rcu_spawn_gp_kthread(void)
3108 struct task_struct *t; 3026 struct task_struct *t;
3109 3027
3110 for_each_rcu_flavor(rsp) { 3028 for_each_rcu_flavor(rsp) {
3111 t = kthread_run(rcu_gp_kthread, rsp, rsp->name); 3029 t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
3112 BUG_ON(IS_ERR(t)); 3030 BUG_ON(IS_ERR(t));
3113 rnp = rcu_get_root(rsp); 3031 rnp = rcu_get_root(rsp);
3114 raw_spin_lock_irqsave(&rnp->lock, flags); 3032 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -3235,6 +3153,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3235 3153
3236 rsp->rda = rda; 3154 rsp->rda = rda;
3237 init_waitqueue_head(&rsp->gp_wq); 3155 init_waitqueue_head(&rsp->gp_wq);
3156 init_irq_work(&rsp->wakeup_work, rsp_wakeup);
3238 rnp = rsp->level[rcu_num_lvls - 1]; 3157 rnp = rsp->level[rcu_num_lvls - 1];
3239 for_each_possible_cpu(i) { 3158 for_each_possible_cpu(i) {
3240 while (i > rnp->grphi) 3159 while (i > rnp->grphi)
@@ -3252,11 +3171,25 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3252 */ 3171 */
3253static void __init rcu_init_geometry(void) 3172static void __init rcu_init_geometry(void)
3254{ 3173{
3174 ulong d;
3255 int i; 3175 int i;
3256 int j; 3176 int j;
3257 int n = nr_cpu_ids; 3177 int n = nr_cpu_ids;
3258 int rcu_capacity[MAX_RCU_LVLS + 1]; 3178 int rcu_capacity[MAX_RCU_LVLS + 1];
3259 3179
3180 /*
3181 * Initialize any unspecified boot parameters.
3182 * The default values of jiffies_till_first_fqs and
3183 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
3184 * value, which is a function of HZ, plus one for each
3185 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
3186 */
3187 d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
3188 if (jiffies_till_first_fqs == ULONG_MAX)
3189 jiffies_till_first_fqs = d;
3190 if (jiffies_till_next_fqs == ULONG_MAX)
3191 jiffies_till_next_fqs = d;
3192
3260 /* If the compile-time values are accurate, just leave. */ 3193 /* If the compile-time values are accurate, just leave. */
3261 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && 3194 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
3262 nr_cpu_ids == NR_CPUS) 3195 nr_cpu_ids == NR_CPUS)
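
For a feel of the new boot-parameter defaults (a standalone userspace sketch,
not patch code; the HZ and CPU-count values are assumed for illustration, and
the macro is re-derived from the rcutree.h hunk below):

#include <stdio.h>

#define RCU_JIFFIES_TILL_FORCE_QS(hz)	(1 + ((hz) > 250) + ((hz) > 500))
#define RCU_JIFFIES_FQS_DIV		256

int main(void)
{
	unsigned long hz = 1000, ncpus = 512;	/* assumed example values */
	unsigned long d;

	/* base jiffies between quiescent-state forcing, plus one per 256 CPUs */
	d = RCU_JIFFIES_TILL_FORCE_QS(hz) + ncpus / RCU_JIFFIES_FQS_DIV;
	printf("jiffies_till_first_fqs default = %lu\n", d);	/* 3 + 2 = 5 */
	return 0;
}

Across common HZ settings the base term works out to roughly 3-10 ms of wall
clock (1 jiffy at HZ=100 or HZ=250, 2 at HZ=300, 3 at HZ=1000), with the
per-256-CPU term adding slack on very large systems.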
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index da77a8f57ff9..4a39d364493c 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -27,6 +27,7 @@
27#include <linux/threads.h> 27#include <linux/threads.h>
28#include <linux/cpumask.h> 28#include <linux/cpumask.h>
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30#include <linux/irq_work.h>
30 31
31/* 32/*
32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and 33 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -342,12 +343,17 @@ struct rcu_data {
342#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 343#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
343#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 344#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
344 345
345#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 346#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
347 /* For jiffies_till_first_fqs and */
348 /* jiffies_till_next_fqs. */
346 349
347#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 350#define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */
348 /* to take at least one */ 351 /* delay between bouts of */
349 /* scheduling clock irq */ 352 /* quiescent-state forcing. */
350 /* before ratting on them. */ 353
354#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */
355 /* at least one scheduling clock */
356 /* irq before ratting on them. */
351 357
352#define rcu_wait(cond) \ 358#define rcu_wait(cond) \
353do { \ 359do { \
@@ -442,6 +448,7 @@ struct rcu_state {
442 char *name; /* Name of structure. */ 448 char *name; /* Name of structure. */
443 char abbr; /* Abbreviated name. */ 449 char abbr; /* Abbreviated name. */
444 struct list_head flavors; /* List of RCU flavors. */ 450 struct list_head flavors; /* List of RCU flavors. */
451 struct irq_work wakeup_work; /* Postponed wakeups. */
445}; 452};
446 453
447/* Values for rcu_state structure's gp_flags field. */ 454/* Values for rcu_state structure's gp_flags field. */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3db5a375d8dd..63098a59216e 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -53,38 +53,37 @@ static char __initdata nocb_buf[NR_CPUS * 5];
53static void __init rcu_bootup_announce_oddness(void) 53static void __init rcu_bootup_announce_oddness(void)
54{ 54{
55#ifdef CONFIG_RCU_TRACE 55#ifdef CONFIG_RCU_TRACE
56 printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n"); 56 pr_info("\tRCU debugfs-based tracing is enabled.\n");
57#endif 57#endif
58#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) 58#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
59 printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n", 59 pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
60 CONFIG_RCU_FANOUT); 60 CONFIG_RCU_FANOUT);
61#endif 61#endif
62#ifdef CONFIG_RCU_FANOUT_EXACT 62#ifdef CONFIG_RCU_FANOUT_EXACT
63 printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n"); 63 pr_info("\tHierarchical RCU autobalancing is disabled.\n");
64#endif 64#endif
65#ifdef CONFIG_RCU_FAST_NO_HZ 65#ifdef CONFIG_RCU_FAST_NO_HZ
66 printk(KERN_INFO 66 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
67 "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
68#endif 67#endif
69#ifdef CONFIG_PROVE_RCU 68#ifdef CONFIG_PROVE_RCU
70 printk(KERN_INFO "\tRCU lockdep checking is enabled.\n"); 69 pr_info("\tRCU lockdep checking is enabled.\n");
71#endif 70#endif
72#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 71#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
73 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 72 pr_info("\tRCU torture testing starts during boot.\n");
74#endif 73#endif
75#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 74#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
76 printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); 75 pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");
77#endif 76#endif
78#if defined(CONFIG_RCU_CPU_STALL_INFO) 77#if defined(CONFIG_RCU_CPU_STALL_INFO)
79 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); 78 pr_info("\tAdditional per-CPU info printed with stalls.\n");
80#endif 79#endif
81#if NUM_RCU_LVL_4 != 0 80#if NUM_RCU_LVL_4 != 0
82 printk(KERN_INFO "\tFour-level hierarchy is enabled.\n"); 81 pr_info("\tFour-level hierarchy is enabled.\n");
83#endif 82#endif
84 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) 83 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
85 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 84 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
86 if (nr_cpu_ids != NR_CPUS) 85 if (nr_cpu_ids != NR_CPUS)
87 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 86 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
88#ifdef CONFIG_RCU_NOCB_CPU 87#ifdef CONFIG_RCU_NOCB_CPU
89#ifndef CONFIG_RCU_NOCB_CPU_NONE 88#ifndef CONFIG_RCU_NOCB_CPU_NONE
90 if (!have_rcu_nocb_mask) { 89 if (!have_rcu_nocb_mask) {
@@ -92,19 +91,19 @@ static void __init rcu_bootup_announce_oddness(void)
92 have_rcu_nocb_mask = true; 91 have_rcu_nocb_mask = true;
93 } 92 }
94#ifdef CONFIG_RCU_NOCB_CPU_ZERO 93#ifdef CONFIG_RCU_NOCB_CPU_ZERO
95 pr_info("\tExperimental no-CBs CPU 0\n"); 94 pr_info("\tOffload RCU callbacks from CPU 0\n");
96 cpumask_set_cpu(0, rcu_nocb_mask); 95 cpumask_set_cpu(0, rcu_nocb_mask);
97#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ 96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
98#ifdef CONFIG_RCU_NOCB_CPU_ALL 97#ifdef CONFIG_RCU_NOCB_CPU_ALL
99 pr_info("\tExperimental no-CBs for all CPUs\n"); 98 pr_info("\tOffload RCU callbacks from all CPUs\n");
100 cpumask_setall(rcu_nocb_mask); 99 cpumask_setall(rcu_nocb_mask);
101#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ 100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
102#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ 101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
103 if (have_rcu_nocb_mask) { 102 if (have_rcu_nocb_mask) {
104 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 103 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
105 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); 104 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
106 if (rcu_nocb_poll) 105 if (rcu_nocb_poll)
107 pr_info("\tExperimental polled no-CBs CPUs.\n"); 106 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
108 } 107 }
109#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 108#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
110} 109}
@@ -123,7 +122,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
123 */ 122 */
124static void __init rcu_bootup_announce(void) 123static void __init rcu_bootup_announce(void)
125{ 124{
126 printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); 125 pr_info("Preemptible hierarchical RCU implementation.\n");
127 rcu_bootup_announce_oddness(); 126 rcu_bootup_announce_oddness();
128} 127}
129 128
@@ -490,13 +489,13 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
490 489
491static void rcu_print_task_stall_begin(struct rcu_node *rnp) 490static void rcu_print_task_stall_begin(struct rcu_node *rnp)
492{ 491{
493 printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", 492 pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
494 rnp->level, rnp->grplo, rnp->grphi); 493 rnp->level, rnp->grplo, rnp->grphi);
495} 494}
496 495
497static void rcu_print_task_stall_end(void) 496static void rcu_print_task_stall_end(void)
498{ 497{
499 printk(KERN_CONT "\n"); 498 pr_cont("\n");
500} 499}
501 500
502#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 501#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
@@ -526,7 +525,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
526 t = list_entry(rnp->gp_tasks, 525 t = list_entry(rnp->gp_tasks,
527 struct task_struct, rcu_node_entry); 526 struct task_struct, rcu_node_entry);
528 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { 527 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
529 printk(KERN_CONT " P%d", t->pid); 528 pr_cont(" P%d", t->pid);
530 ndetected++; 529 ndetected++;
531 } 530 }
532 rcu_print_task_stall_end(); 531 rcu_print_task_stall_end();
@@ -933,6 +932,24 @@ static void __init __rcu_init_preempt(void)
933 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); 932 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
934} 933}
935 934
935/*
936 * Check for a task exiting while in a preemptible-RCU read-side
937 * critical section, and clean up if so. No need to issue warnings,
938 * as debug_check_no_locks_held() already does this if lockdep
939 * is enabled.
940 */
941void exit_rcu(void)
942{
943 struct task_struct *t = current;
944
945 if (likely(list_empty(&current->rcu_node_entry)))
946 return;
947 t->rcu_read_lock_nesting = 1;
948 barrier();
949 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
950 __rcu_read_unlock();
951}
952
936#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 953#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
937 954
938static struct rcu_state *rcu_state = &rcu_sched_state; 955static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -942,7 +959,7 @@ static struct rcu_state *rcu_state = &rcu_sched_state;
942 */ 959 */
943static void __init rcu_bootup_announce(void) 960static void __init rcu_bootup_announce(void)
944{ 961{
945 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 962 pr_info("Hierarchical RCU implementation.\n");
946 rcu_bootup_announce_oddness(); 963 rcu_bootup_announce_oddness();
947} 964}
948 965
@@ -1101,6 +1118,14 @@ static void __init __rcu_init_preempt(void)
1101{ 1118{
1102} 1119}
1103 1120
1121/*
1122 * Because preemptible RCU does not exist, tasks cannot possibly exit
1123 * while in preemptible RCU read-side critical sections.
1124 */
1125void exit_rcu(void)
1126{
1127}
1128
1104#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1129#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1105 1130
1106#ifdef CONFIG_RCU_BOOST 1131#ifdef CONFIG_RCU_BOOST
@@ -1629,7 +1654,7 @@ static bool rcu_try_advance_all_cbs(void)
1629 */ 1654 */
1630 if (rdp->completed != rnp->completed && 1655 if (rdp->completed != rnp->completed &&
1631 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) 1656 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
1632 rcu_process_gp_end(rsp, rdp); 1657 note_gp_changes(rsp, rdp);
1633 1658
1634 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1659 if (cpu_has_callbacks_ready_to_invoke(rdp))
1635 cbs_ready = true; 1660 cbs_ready = true;
@@ -1883,7 +1908,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1883/* Initiate the stall-info list. */ 1908/* Initiate the stall-info list. */
1884static void print_cpu_stall_info_begin(void) 1909static void print_cpu_stall_info_begin(void)
1885{ 1910{
1886 printk(KERN_CONT "\n"); 1911 pr_cont("\n");
1887} 1912}
1888 1913
1889/* 1914/*
@@ -1914,7 +1939,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1914 ticks_value = rsp->gpnum - rdp->gpnum; 1939 ticks_value = rsp->gpnum - rdp->gpnum;
1915 } 1940 }
1916 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1941 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1917 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", 1942 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
1918 cpu, ticks_value, ticks_title, 1943 cpu, ticks_value, ticks_title,
1919 atomic_read(&rdtp->dynticks) & 0xfff, 1944 atomic_read(&rdtp->dynticks) & 0xfff,
1920 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1945 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
@@ -1925,7 +1950,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1925/* Terminate the stall-info list. */ 1950/* Terminate the stall-info list. */
1926static void print_cpu_stall_info_end(void) 1951static void print_cpu_stall_info_end(void)
1927{ 1952{
1928 printk(KERN_ERR "\t"); 1953 pr_err("\t");
1929} 1954}
1930 1955
1931/* Zero ->ticks_this_gp for all flavors of RCU. */ 1956/* Zero ->ticks_this_gp for all flavors of RCU. */
@@ -1948,17 +1973,17 @@ static void increment_cpu_stall_ticks(void)
1948 1973
1949static void print_cpu_stall_info_begin(void) 1974static void print_cpu_stall_info_begin(void)
1950{ 1975{
1951 printk(KERN_CONT " {"); 1976 pr_cont(" {");
1952} 1977}
1953 1978
1954static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) 1979static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1955{ 1980{
1956 printk(KERN_CONT " %d", cpu); 1981 pr_cont(" %d", cpu);
1957} 1982}
1958 1983
1959static void print_cpu_stall_info_end(void) 1984static void print_cpu_stall_info_end(void)
1960{ 1985{
1961 printk(KERN_CONT "} "); 1986 pr_cont("} ");
1962} 1987}
1963 1988
1964static void zero_cpu_stall_ticks(struct rcu_data *rdp) 1989static void zero_cpu_stall_ticks(struct rcu_data *rdp)
diff --git a/kernel/resource.c b/kernel/resource.c
index d7386986e10e..3f285dce9347 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -409,6 +409,7 @@ int __weak page_is_ram(unsigned long pfn)
409{ 409{
410 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; 410 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
411} 411}
412EXPORT_SYMBOL_GPL(page_is_ram);
412 413
413void __weak arch_remove_reservations(struct resource *avail) 414void __weak arch_remove_reservations(struct resource *avail)
414{ 415{
@@ -448,7 +449,6 @@ static int __find_resource(struct resource *root, struct resource *old,
448 struct resource *this = root->child; 449 struct resource *this = root->child;
449 struct resource tmp = *new, avail, alloc; 450 struct resource tmp = *new, avail, alloc;
450 451
451 tmp.flags = new->flags;
452 tmp.start = root->start; 452 tmp.start = root->start;
453 /* 453 /*
454 * Skip past an allocated resource that starts at 0, since the assignment 454 * Skip past an allocated resource that starts at 0, since the assignment
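
The dropped assignment is redundant rather than a behavior change: the
aggregate initializer on the preceding line (struct resource tmp = *new)
already copies every member, flags included. A trivial standalone reminder of
that C semantics (the cut-down struct here is hypothetical, not the kernel's
struct resource):

#include <assert.h>

struct res { unsigned long start, end, flags; };

int main(void)
{
	struct res new_res = { .start = 1, .end = 2, .flags = 0x200 };
	struct res tmp = new_res;		/* copies all members... */

	assert(tmp.flags == new_res.flags);	/* ...so tmp.flags = new_res.flags is a no-op */
	return 0;
}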
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 1e09308bf2a1..0dd6aec1cb6a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -145,6 +145,19 @@ int max_lock_depth = 1024;
145/* 145/*
146 * Adjust the priority chain. Also used for deadlock detection. 146 * Adjust the priority chain. Also used for deadlock detection.
147 * Decreases task's usage by one - may thus free the task. 147 * Decreases task's usage by one - may thus free the task.
148 *
149 * @task: the task owning the mutex (owner) for which a chain walk is probably
150 * needed
151 * @deadlock_detect: do we have to carry out deadlock detection?
152 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
153 * things for a task that has just got its priority adjusted, and
154 * is waiting on a mutex)
155 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
156 * its priority to the mutex owner (can be NULL in the case
157 * depicted above or if the top waiter has gone away and we are
158 * actually deboosting the owner)
159 * @top_task: the current top waiter
160 *
148 * Returns 0 or -EDEADLK. 161 * Returns 0 or -EDEADLK.
149 */ 162 */
150static int rt_mutex_adjust_prio_chain(struct task_struct *task, 163static int rt_mutex_adjust_prio_chain(struct task_struct *task,
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index deaf90e4a1de..54adcf35f495 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer 11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o 15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 64de5f8b0c9e..4a073539c58e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -77,8 +77,6 @@ static inline struct autogroup *autogroup_create(void)
77 if (IS_ERR(tg)) 77 if (IS_ERR(tg))
78 goto out_free; 78 goto out_free;
79 79
80 sched_online_group(tg, &root_task_group);
81
82 kref_init(&ag->kref); 80 kref_init(&ag->kref);
83 init_rwsem(&ag->lock); 81 init_rwsem(&ag->lock);
84 ag->id = atomic_inc_return(&autogroup_seq_nr); 82 ag->id = atomic_inc_return(&autogroup_seq_nr);
@@ -98,6 +96,7 @@ static inline struct autogroup *autogroup_create(void)
98#endif 96#endif
99 tg->autogroup = ag; 97 tg->autogroup = ag;
100 98
99 sched_online_group(tg, &root_task_group);
101 return ag; 100 return ag;
102 101
103out_free: 102out_free:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 58453b8272fd..9b1f2e533b95 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -633,7 +633,19 @@ void wake_up_nohz_cpu(int cpu)
633static inline bool got_nohz_idle_kick(void) 633static inline bool got_nohz_idle_kick(void)
634{ 634{
635 int cpu = smp_processor_id(); 635 int cpu = smp_processor_id();
636 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 636
637 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
638 return false;
639
640 if (idle_cpu(cpu) && !need_resched())
641 return true;
642
643 /*
644 * We can't run Idle Load Balance on this CPU for this time so we
645 * cancel it and clear NOHZ_BALANCE_KICK
646 */
647 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
648 return false;
637} 649}
638 650
639#else /* CONFIG_NO_HZ_COMMON */ 651#else /* CONFIG_NO_HZ_COMMON */
@@ -667,7 +679,7 @@ void sched_avg_update(struct rq *rq)
667{ 679{
668 s64 period = sched_avg_period(); 680 s64 period = sched_avg_period();
669 681
670 while ((s64)(rq->clock - rq->age_stamp) > period) { 682 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
671 /* 683 /*
672 * Inline assembly required to prevent the compiler 684 * Inline assembly required to prevent the compiler
673 * optimising this loop into a divmod call. 685 * optimising this loop into a divmod call.
@@ -1328,7 +1340,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1328 p->sched_class->task_woken(rq, p); 1340 p->sched_class->task_woken(rq, p);
1329 1341
1330 if (rq->idle_stamp) { 1342 if (rq->idle_stamp) {
1331 u64 delta = rq->clock - rq->idle_stamp; 1343 u64 delta = rq_clock(rq) - rq->idle_stamp;
1332 u64 max = 2*sysctl_sched_migration_cost; 1344 u64 max = 2*sysctl_sched_migration_cost;
1333 1345
1334 if (delta > max) 1346 if (delta > max)
@@ -1365,6 +1377,8 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
1365 1377
1366 rq = __task_rq_lock(p); 1378 rq = __task_rq_lock(p);
1367 if (p->on_rq) { 1379 if (p->on_rq) {
1380 /* check_preempt_curr() may use rq clock */
1381 update_rq_clock(rq);
1368 ttwu_do_wakeup(rq, p, wake_flags); 1382 ttwu_do_wakeup(rq, p, wake_flags);
1369 ret = 1; 1383 ret = 1;
1370 } 1384 }
@@ -1393,8 +1407,9 @@ static void sched_ttwu_pending(void)
1393 1407
1394void scheduler_ipi(void) 1408void scheduler_ipi(void)
1395{ 1409{
1396 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() 1410 if (llist_empty(&this_rq()->wake_list)
1397 && !tick_nohz_full_cpu(smp_processor_id())) 1411 && !tick_nohz_full_cpu(smp_processor_id())
1412 && !got_nohz_idle_kick())
1398 return; 1413 return;
1399 1414
1400 /* 1415 /*
@@ -1417,7 +1432,7 @@ void scheduler_ipi(void)
1417 /* 1432 /*
1418 * Check if someone kicked us for doing the nohz idle load balance. 1433 * Check if someone kicked us for doing the nohz idle load balance.
1419 */ 1434 */
1420 if (unlikely(got_nohz_idle_kick() && !need_resched())) { 1435 if (unlikely(got_nohz_idle_kick())) {
1421 this_rq()->idle_balance = 1; 1436 this_rq()->idle_balance = 1;
1422 raise_softirq_irqoff(SCHED_SOFTIRQ); 1437 raise_softirq_irqoff(SCHED_SOFTIRQ);
1423 } 1438 }
@@ -1596,15 +1611,6 @@ static void __sched_fork(struct task_struct *p)
1596 p->se.vruntime = 0; 1611 p->se.vruntime = 0;
1597 INIT_LIST_HEAD(&p->se.group_node); 1612 INIT_LIST_HEAD(&p->se.group_node);
1598 1613
1599/*
1600 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
1601 * removed when useful for applications beyond shares distribution (e.g.
1602 * load-balance).
1603 */
1604#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1605 p->se.avg.runnable_avg_period = 0;
1606 p->se.avg.runnable_avg_sum = 0;
1607#endif
1608#ifdef CONFIG_SCHEDSTATS 1614#ifdef CONFIG_SCHEDSTATS
1609 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1615 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1610#endif 1616#endif
@@ -1748,6 +1754,8 @@ void wake_up_new_task(struct task_struct *p)
1748 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1754 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1749#endif 1755#endif
1750 1756
1757 /* Initialize new task's runnable average */
1758 init_task_runnable_average(p);
1751 rq = __task_rq_lock(p); 1759 rq = __task_rq_lock(p);
1752 activate_task(rq, p, 0); 1760 activate_task(rq, p, 0);
1753 p->on_rq = 1; 1761 p->on_rq = 1;
@@ -2056,575 +2064,6 @@ unsigned long nr_iowait_cpu(int cpu)
2056 return atomic_read(&this->nr_iowait); 2064 return atomic_read(&this->nr_iowait);
2057} 2065}
2058 2066
2059unsigned long this_cpu_load(void)
2060{
2061 struct rq *this = this_rq();
2062 return this->cpu_load[0];
2063}
2064
2065
2066/*
2067 * Global load-average calculations
2068 *
2069 * We take a distributed and async approach to calculating the global load-avg
2070 * in order to minimize overhead.
2071 *
2072 * The global load average is an exponentially decaying average of nr_running +
2073 * nr_uninterruptible.
2074 *
2075 * Once every LOAD_FREQ:
2076 *
2077 * nr_active = 0;
2078 * for_each_possible_cpu(cpu)
2079 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2080 *
2081 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2082 *
2083 * Due to a number of reasons the above turns into the mess below:
2084 *
2085 * - for_each_possible_cpu() is prohibitively expensive on machines with
2086 * serious number of cpus, therefore we need to take a distributed approach
2087 * to calculating nr_active.
2088 *
2089 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2090 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2091 *
2092 * So assuming nr_active := 0 when we start out -- true per definition, we
2093 * can simply take per-cpu deltas and fold those into a global accumulate
2094 * to obtain the same result. See calc_load_fold_active().
2095 *
2096 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2097 * across the machine, we assume 10 ticks is sufficient time for every
2098 * cpu to have completed this task.
2099 *
2100 * This places an upper-bound on the IRQ-off latency of the machine. Then
2101 * again, being late doesn't lose the delta, just wrecks the sample.
2102 *
2103 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2104 * this would add another cross-cpu cacheline miss and atomic operation
2105 * to the wakeup path. Instead we increment on whatever cpu the task ran
2106 * when it went into uninterruptible state and decrement on whatever cpu
2107 * did the wakeup. This means that only the sum of nr_uninterruptible over
2108 * all cpus yields the correct result.
2109 *
2110 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
2111 */
2112
2113/* Variables and functions for calc_load */
2114static atomic_long_t calc_load_tasks;
2115static unsigned long calc_load_update;
2116unsigned long avenrun[3];
2117EXPORT_SYMBOL(avenrun); /* should be removed */
2118
2119/**
2120 * get_avenrun - get the load average array
2121 * @loads: pointer to dest load array
2122 * @offset: offset to add
2123 * @shift: shift count to shift the result left
2124 *
2125 * These values are estimates at best, so no need for locking.
2126 */
2127void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2128{
2129 loads[0] = (avenrun[0] + offset) << shift;
2130 loads[1] = (avenrun[1] + offset) << shift;
2131 loads[2] = (avenrun[2] + offset) << shift;
2132}
2133
2134static long calc_load_fold_active(struct rq *this_rq)
2135{
2136 long nr_active, delta = 0;
2137
2138 nr_active = this_rq->nr_running;
2139 nr_active += (long) this_rq->nr_uninterruptible;
2140
2141 if (nr_active != this_rq->calc_load_active) {
2142 delta = nr_active - this_rq->calc_load_active;
2143 this_rq->calc_load_active = nr_active;
2144 }
2145
2146 return delta;
2147}
2148
2149/*
2150 * a1 = a0 * e + a * (1 - e)
2151 */
2152static unsigned long
2153calc_load(unsigned long load, unsigned long exp, unsigned long active)
2154{
2155 load *= exp;
2156 load += active * (FIXED_1 - exp);
2157 load += 1UL << (FSHIFT - 1);
2158 return load >> FSHIFT;
2159}
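
To see the fixed-point average in action (standalone userspace sketch, not
patch code; FSHIFT/FIXED_1/EXP_1 are assumed to match the usual
include/linux/sched.h constants, and the single-task workload is made up):

#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884UL			/* 1/exp(5sec/1min) in fixed point */

int main(void)
{
	unsigned long load = 0;			/* idle machine, avenrun[0] == 0 */
	unsigned long active = 1 * FIXED_1;	/* then one task stays runnable */
	int i;

	for (i = 1; i <= 5; i++) {
		/* same update as calc_load() above, with rounding */
		load = (load * EXP_1 + active * (FIXED_1 - EXP_1) + FIXED_1 / 2) >> FSHIFT;
		printf("after window %d: %lu.%02lu\n", i,
		       load >> FSHIFT, ((load & (FIXED_1 - 1)) * 100) >> FSHIFT);
	}
	return 0;				/* prints the familiar 0.08, 0.15, ... ramp */
}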
2160
2161#ifdef CONFIG_NO_HZ_COMMON
2162/*
2163 * Handle NO_HZ for the global load-average.
2164 *
2165 * Since the above described distributed algorithm to compute the global
2166 * load-average relies on per-cpu sampling from the tick, it is affected by
2167 * NO_HZ.
2168 *
2169 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2170 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2171 * when we read the global state.
2172 *
2173 * Obviously reality has to ruin such a delightfully simple scheme:
2174 *
2175 * - When we go NO_HZ idle during the window, we can negate our sample
2176 * contribution, causing under-accounting.
2177 *
2178 * We avoid this by keeping two idle-delta counters and flipping them
2179 * when the window starts, thus separating old and new NO_HZ load.
2180 *
2181 * The only trick is the slight shift in index flip for read vs write.
2182 *
2183 * 0s 5s 10s 15s
2184 * +10 +10 +10 +10
2185 * |-|-----------|-|-----------|-|-----------|-|
2186 * r:0 0 1 1 0 0 1 1 0
2187 * w:0 1 1 0 0 1 1 0 0
2188 *
2189 * This ensures we'll fold the old idle contribution in this window while
2190 * accumulating the new one.
2191 *
2192 * - When we wake up from NO_HZ idle during the window, we push up our
2193 * contribution, since we effectively move our sample point to a known
2194 * busy state.
2195 *
2196 * This is solved by pushing the window forward, and thus skipping the
2197 * sample, for this cpu (effectively using the idle-delta for this cpu which
2198 * was in effect at the time the window opened). This also solves the issue
2199 * of having to deal with a cpu having been in NOHZ idle for multiple
2200 * LOAD_FREQ intervals.
2201 *
2202 * When making the ILB scale, we should try to pull this in as well.
2203 */
2204static atomic_long_t calc_load_idle[2];
2205static int calc_load_idx;
2206
2207static inline int calc_load_write_idx(void)
2208{
2209 int idx = calc_load_idx;
2210
2211 /*
2212 * See calc_global_nohz(), if we observe the new index, we also
2213 * need to observe the new update time.
2214 */
2215 smp_rmb();
2216
2217 /*
2218 * If the folding window started, make sure we start writing in the
2219 * next idle-delta.
2220 */
2221 if (!time_before(jiffies, calc_load_update))
2222 idx++;
2223
2224 return idx & 1;
2225}
2226
2227static inline int calc_load_read_idx(void)
2228{
2229 return calc_load_idx & 1;
2230}
2231
2232void calc_load_enter_idle(void)
2233{
2234 struct rq *this_rq = this_rq();
2235 long delta;
2236
2237 /*
2238 * We're going into NOHZ mode, if there's any pending delta, fold it
2239 * into the pending idle delta.
2240 */
2241 delta = calc_load_fold_active(this_rq);
2242 if (delta) {
2243 int idx = calc_load_write_idx();
2244 atomic_long_add(delta, &calc_load_idle[idx]);
2245 }
2246}
2247
2248void calc_load_exit_idle(void)
2249{
2250 struct rq *this_rq = this_rq();
2251
2252 /*
2253 * If we're still before the sample window, we're done.
2254 */
2255 if (time_before(jiffies, this_rq->calc_load_update))
2256 return;
2257
2258 /*
2259 * We woke inside or after the sample window, this means we're already
2260 * accounted through the nohz accounting, so skip the entire deal and
2261 * sync up for the next window.
2262 */
2263 this_rq->calc_load_update = calc_load_update;
2264 if (time_before(jiffies, this_rq->calc_load_update + 10))
2265 this_rq->calc_load_update += LOAD_FREQ;
2266}
2267
2268static long calc_load_fold_idle(void)
2269{
2270 int idx = calc_load_read_idx();
2271 long delta = 0;
2272
2273 if (atomic_long_read(&calc_load_idle[idx]))
2274 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2275
2276 return delta;
2277}
2278
2279/**
2280 * fixed_power_int - compute: x^n, in O(log n) time
2281 *
2282 * @x: base of the power
2283 * @frac_bits: fractional bits of @x
2284 * @n: power to raise @x to.
2285 *
2286 * By exploiting the relation between the definition of the natural power
2287 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
2288 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
2289 * (where: n_i \elem {0, 1}, the binary vector representing n),
2290 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
2291 * of course trivially computable in O(log_2 n), the length of our binary
2292 * vector.
2293 */
2294static unsigned long
2295fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
2296{
2297 unsigned long result = 1UL << frac_bits;
2298
2299 if (n) for (;;) {
2300 if (n & 1) {
2301 result *= x;
2302 result += 1UL << (frac_bits - 1);
2303 result >>= frac_bits;
2304 }
2305 n >>= 1;
2306 if (!n)
2307 break;
2308 x *= x;
2309 x += 1UL << (frac_bits - 1);
2310 x >>= frac_bits;
2311 }
2312
2313 return result;
2314}
2315
2316/*
2317 * a1 = a0 * e + a * (1 - e)
2318 *
2319 * a2 = a1 * e + a * (1 - e)
2320 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
2321 * = a0 * e^2 + a * (1 - e) * (1 + e)
2322 *
2323 * a3 = a2 * e + a * (1 - e)
2324 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
2325 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
2326 *
2327 * ...
2328 *
2329 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
2330 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
2331 * = a0 * e^n + a * (1 - e^n)
2332 *
2333 * [1] application of the geometric series:
2334 *
2335 * n 1 - x^(n+1)
2336 * S_n := \Sum x^i = -------------
2337 * i=0 1 - x
2338 */
2339static unsigned long
2340calc_load_n(unsigned long load, unsigned long exp,
2341 unsigned long active, unsigned int n)
2342{
2343
2344 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2345}
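
And the complementary case handled by calc_load_n(): how a load sample decays
when the machine idles across n missed LOAD_FREQ windows and calc_global_nohz()
catches up in one step (standalone userspace sketch, not patch code; constants
assumed as in the previous sketch):

#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884UL

/* same O(log n) fixed-point power as fixed_power_int() above */
static unsigned long fp_pow(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	while (n) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

int main(void)
{
	unsigned long load = FIXED_1;		/* a 1-minute average of 1.00 */
	unsigned int n;

	for (n = 1; n <= 12; n++)		/* 12 windows is about one idle minute */
		printf("after %2u idle windows: %4lu/2048\n", n,
		       (load * fp_pow(EXP_1, FSHIFT, n) + FIXED_1 / 2) >> FSHIFT);
	return 0;
}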
2346
2347/*
2348 * NO_HZ can leave us missing all per-cpu ticks calling
2349 * calc_load_account_active(), but since an idle CPU folds its delta into
2350 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
2351 * in the pending idle delta if our idle period crossed a load cycle boundary.
2352 *
2353 * Once we've updated the global active value, we need to apply the exponential
2354 * weights adjusted to the number of cycles missed.
2355 */
2356static void calc_global_nohz(void)
2357{
2358 long delta, active, n;
2359
2360 if (!time_before(jiffies, calc_load_update + 10)) {
2361 /*
2362 * Catch-up, fold however many we are behind still
2363 */
2364 delta = jiffies - calc_load_update - 10;
2365 n = 1 + (delta / LOAD_FREQ);
2366
2367 active = atomic_long_read(&calc_load_tasks);
2368 active = active > 0 ? active * FIXED_1 : 0;
2369
2370 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2371 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2372 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2373
2374 calc_load_update += n * LOAD_FREQ;
2375 }
2376
2377 /*
2378 * Flip the idle index...
2379 *
2380 * Make sure we first write the new time then flip the index, so that
2381 * calc_load_write_idx() will see the new time when it reads the new
2382 * index, this avoids a double flip messing things up.
2383 */
2384 smp_wmb();
2385 calc_load_idx++;
2386}
2387#else /* !CONFIG_NO_HZ_COMMON */
2388
2389static inline long calc_load_fold_idle(void) { return 0; }
2390static inline void calc_global_nohz(void) { }
2391
2392#endif /* CONFIG_NO_HZ_COMMON */
2393
2394/*
2395 * calc_load - update the avenrun load estimates 10 ticks after the
2396 * CPUs have updated calc_load_tasks.
2397 */
2398void calc_global_load(unsigned long ticks)
2399{
2400 long active, delta;
2401
2402 if (time_before(jiffies, calc_load_update + 10))
2403 return;
2404
2405 /*
2406 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2407 */
2408 delta = calc_load_fold_idle();
2409 if (delta)
2410 atomic_long_add(delta, &calc_load_tasks);
2411
2412 active = atomic_long_read(&calc_load_tasks);
2413 active = active > 0 ? active * FIXED_1 : 0;
2414
2415 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2416 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2417 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2418
2419 calc_load_update += LOAD_FREQ;
2420
2421 /*
2422 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2423 */
2424 calc_global_nohz();
2425}
2426
2427/*
2428 * Called from update_cpu_load() to periodically update this CPU's
2429 * active count.
2430 */
2431static void calc_load_account_active(struct rq *this_rq)
2432{
2433 long delta;
2434
2435 if (time_before(jiffies, this_rq->calc_load_update))
2436 return;
2437
2438 delta = calc_load_fold_active(this_rq);
2439 if (delta)
2440 atomic_long_add(delta, &calc_load_tasks);
2441
2442 this_rq->calc_load_update += LOAD_FREQ;
2443}
2444
2445/*
2446 * End of global load-average stuff
2447 */
2448
2449/*
2450 * The exact cpuload at various idx values, calculated at every tick would be
2451 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2452 *
2453 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
2454 * on nth tick when cpu may be busy, then we have:
2455 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2456 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
2457 *
2458 * decay_load_missed() below does efficient calculation of
2459 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2460 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
2461 *
2462 * The calculation is approximated on a 128 point scale.
2463 * degrade_zero_ticks is the number of ticks after which load at any
2464 * particular idx is approximated to be zero.
2465 * degrade_factor is a precomputed table, a row for each load idx.
2466 * Each column corresponds to degradation factor for a power of two ticks,
2467 * based on 128 point scale.
2468 * Example:
2469 * row 2, col 3 (=12) says that the degradation at load idx 2 after
2470 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
2471 *
2472 * With this power of 2 load factors, we can degrade the load n times
2473 * by looking at 1 bits in n and doing as many mult/shift instead of
2474 * n mult/shifts needed by the exact degradation.
2475 */
2476#define DEGRADE_SHIFT 7
2477static const unsigned char
2478 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
2479static const unsigned char
2480 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
2481 {0, 0, 0, 0, 0, 0, 0, 0},
2482 {64, 32, 8, 0, 0, 0, 0, 0},
2483 {96, 72, 40, 12, 1, 0, 0},
2484 {112, 98, 75, 43, 15, 1, 0},
2485 {120, 112, 98, 76, 45, 16, 2} };
2486
2487/*
2488 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
2489 * would be when CPU is idle and so we just decay the old load without
2490 * adding any new load.
2491 */
2492static unsigned long
2493decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2494{
2495 int j = 0;
2496
2497 if (!missed_updates)
2498 return load;
2499
2500 if (missed_updates >= degrade_zero_ticks[idx])
2501 return 0;
2502
2503 if (idx == 1)
2504 return load >> missed_updates;
2505
2506 while (missed_updates) {
2507 if (missed_updates % 2)
2508 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2509
2510 missed_updates >>= 1;
2511 j++;
2512 }
2513 return load;
2514}
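
A quick numeric check of the table above, matching the comment's "row 2,
col 3" example (standalone userspace code, illustrative only):

#include <stdio.h>

int main(void)
{
	double exact = 1.0;
	int i;

	for (i = 0; i < 8; i++)		/* load idx 2 retains 3/4 per missed tick */
		exact *= 3.0 / 4.0;
	printf("exact (3/4)^8 = %.4f, table 12/128 = %.4f\n", exact, 12.0 / 128.0);
	return 0;
}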
2515
2516/*
2517 * Update rq->cpu_load[] statistics. This function is usually called every
2518 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2519 * every tick. We fix it up based on jiffies.
2520 */
2521static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2522 unsigned long pending_updates)
2523{
2524 int i, scale;
2525
2526 this_rq->nr_load_updates++;
2527
2528 /* Update our load: */
2529 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2530 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2531 unsigned long old_load, new_load;
2532
2533 /* scale is effectively 1 << i now, and >> i divides by scale */
2534
2535 old_load = this_rq->cpu_load[i];
2536 old_load = decay_load_missed(old_load, pending_updates - 1, i);
2537 new_load = this_load;
2538 /*
2539 * Round up the averaging division if load is increasing. This
2540 * prevents us from getting stuck on 9 if the load is 10, for
2541 * example.
2542 */
2543 if (new_load > old_load)
2544 new_load += scale - 1;
2545
2546 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
2547 }
2548
2549 sched_avg_update(this_rq);
2550}
2551
2552#ifdef CONFIG_NO_HZ_COMMON
2553/*
2554 * There is no sane way to deal with nohz on smp when using jiffies because the
2555 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2556 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2557 *
2558 * Therefore we cannot use the delta approach from the regular tick since that
2559 * would seriously skew the load calculation. However we'll make do for those
2560 * updates happening while idle (nohz_idle_balance) or coming out of idle
2561 * (tick_nohz_idle_exit).
2562 *
2563 * This means we might still be one tick off for nohz periods.
2564 */
2565
2566/*
2567 * Called from nohz_idle_balance() to update the load ratings before doing the
2568 * idle balance.
2569 */
2570void update_idle_cpu_load(struct rq *this_rq)
2571{
2572 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2573 unsigned long load = this_rq->load.weight;
2574 unsigned long pending_updates;
2575
2576 /*
2577 * bail if there's load or we're actually up-to-date.
2578 */
2579 if (load || curr_jiffies == this_rq->last_load_update_tick)
2580 return;
2581
2582 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2583 this_rq->last_load_update_tick = curr_jiffies;
2584
2585 __update_cpu_load(this_rq, load, pending_updates);
2586}
2587
2588/*
2589 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2590 */
2591void update_cpu_load_nohz(void)
2592{
2593 struct rq *this_rq = this_rq();
2594 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2595 unsigned long pending_updates;
2596
2597 if (curr_jiffies == this_rq->last_load_update_tick)
2598 return;
2599
2600 raw_spin_lock(&this_rq->lock);
2601 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2602 if (pending_updates) {
2603 this_rq->last_load_update_tick = curr_jiffies;
2604 /*
2605 * We were idle, this means load 0, the current load might be
2606 * !0 due to remote wakeups and the sort.
2607 */
2608 __update_cpu_load(this_rq, 0, pending_updates);
2609 }
2610 raw_spin_unlock(&this_rq->lock);
2611}
2612#endif /* CONFIG_NO_HZ_COMMON */
2613
2614/*
2615 * Called from scheduler_tick()
2616 */
2617static void update_cpu_load_active(struct rq *this_rq)
2618{
2619 /*
2620 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2621 */
2622 this_rq->last_load_update_tick = jiffies;
2623 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2624
2625 calc_load_account_active(this_rq);
2626}
2627
2628#ifdef CONFIG_SMP 2067#ifdef CONFIG_SMP
2629 2068
2630/* 2069/*
@@ -2673,7 +2112,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2673 2112
2674 if (task_current(rq, p)) { 2113 if (task_current(rq, p)) {
2675 update_rq_clock(rq); 2114 update_rq_clock(rq);
2676 ns = rq->clock_task - p->se.exec_start; 2115 ns = rq_clock_task(rq) - p->se.exec_start;
2677 if ((s64)ns < 0) 2116 if ((s64)ns < 0)
2678 ns = 0; 2117 ns = 0;
2679 } 2118 }
@@ -2726,8 +2165,8 @@ void scheduler_tick(void)
2726 2165
2727 raw_spin_lock(&rq->lock); 2166 raw_spin_lock(&rq->lock);
2728 update_rq_clock(rq); 2167 update_rq_clock(rq);
2729 update_cpu_load_active(rq);
2730 curr->sched_class->task_tick(rq, curr, 0); 2168 curr->sched_class->task_tick(rq, curr, 0);
2169 update_cpu_load_active(rq);
2731 raw_spin_unlock(&rq->lock); 2170 raw_spin_unlock(&rq->lock);
2732 2171
2733 perf_event_task_tick(); 2172 perf_event_task_tick();
@@ -4745,7 +4184,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4745 */ 4184 */
4746 idle->sched_class = &idle_sched_class; 4185 idle->sched_class = &idle_sched_class;
4747 ftrace_graph_init_idle_task(idle, cpu); 4186 ftrace_graph_init_idle_task(idle, cpu);
4748 vtime_init_idle(idle); 4187 vtime_init_idle(idle, cpu);
4749#if defined(CONFIG_SMP) 4188#if defined(CONFIG_SMP)
4750 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4189 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4751#endif 4190#endif
@@ -4947,6 +4386,13 @@ static void migrate_tasks(unsigned int dead_cpu)
4947 */ 4386 */
4948 rq->stop = NULL; 4387 rq->stop = NULL;
4949 4388
4389 /*
4390 * put_prev_task() and pick_next_task() sched
4391 * class methods both need to have an up-to-date
4392 * value of rq->clock[_task]
4393 */
4394 update_rq_clock(rq);
4395
4950 for ( ; ; ) { 4396 for ( ; ; ) {
4951 /* 4397 /*
4952 * There's this thread running, bail when that's the only 4398 * There's this thread running, bail when that's the only
@@ -5080,7 +4526,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5080 return table; 4526 return table;
5081} 4527}
5082 4528
5083static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 4529static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5084{ 4530{
5085 struct ctl_table *entry, *table; 4531 struct ctl_table *entry, *table;
5086 struct sched_domain *sd; 4532 struct sched_domain *sd;
@@ -5894,7 +5340,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5894 get_group(cpu, sdd, &sd->groups); 5340 get_group(cpu, sdd, &sd->groups);
5895 atomic_inc(&sd->groups->ref); 5341 atomic_inc(&sd->groups->ref);
5896 5342
5897 if (cpu != cpumask_first(sched_domain_span(sd))) 5343 if (cpu != cpumask_first(span))
5898 return 0; 5344 return 0;
5899 5345
5900 lockdep_assert_held(&sched_domains_mutex); 5346 lockdep_assert_held(&sched_domains_mutex);
@@ -5904,12 +5350,12 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5904 5350
5905 for_each_cpu(i, span) { 5351 for_each_cpu(i, span) {
5906 struct sched_group *sg; 5352 struct sched_group *sg;
5907 int group = get_group(i, sdd, &sg); 5353 int group, j;
5908 int j;
5909 5354
5910 if (cpumask_test_cpu(i, covered)) 5355 if (cpumask_test_cpu(i, covered))
5911 continue; 5356 continue;
5912 5357
5358 group = get_group(i, sdd, &sg);
5913 cpumask_clear(sched_group_cpus(sg)); 5359 cpumask_clear(sched_group_cpus(sg));
5914 sg->sgp->power = 0; 5360 sg->sgp->power = 0;
5915 cpumask_setall(sched_group_mask(sg)); 5361 cpumask_setall(sched_group_mask(sg));
@@ -5947,7 +5393,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5947{ 5393{
5948 struct sched_group *sg = sd->groups; 5394 struct sched_group *sg = sd->groups;
5949 5395
5950 WARN_ON(!sd || !sg); 5396 WARN_ON(!sg);
5951 5397
5952 do { 5398 do {
5953 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 5399 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
@@ -6112,6 +5558,9 @@ static struct sched_domain_topology_level default_topology[] = {
6112 5558
6113static struct sched_domain_topology_level *sched_domain_topology = default_topology; 5559static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6114 5560
5561#define for_each_sd_topology(tl) \
5562 for (tl = sched_domain_topology; tl->init; tl++)
5563
6115#ifdef CONFIG_NUMA 5564#ifdef CONFIG_NUMA
6116 5565
6117static int sched_domains_numa_levels; 5566static int sched_domains_numa_levels;
@@ -6409,7 +5858,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6409 struct sched_domain_topology_level *tl; 5858 struct sched_domain_topology_level *tl;
6410 int j; 5859 int j;
6411 5860
6412 for (tl = sched_domain_topology; tl->init; tl++) { 5861 for_each_sd_topology(tl) {
6413 struct sd_data *sdd = &tl->data; 5862 struct sd_data *sdd = &tl->data;
6414 5863
6415 sdd->sd = alloc_percpu(struct sched_domain *); 5864 sdd->sd = alloc_percpu(struct sched_domain *);
@@ -6462,7 +5911,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
6462 struct sched_domain_topology_level *tl; 5911 struct sched_domain_topology_level *tl;
6463 int j; 5912 int j;
6464 5913
6465 for (tl = sched_domain_topology; tl->init; tl++) { 5914 for_each_sd_topology(tl) {
6466 struct sd_data *sdd = &tl->data; 5915 struct sd_data *sdd = &tl->data;
6467 5916
6468 for_each_cpu(j, cpu_map) { 5917 for_each_cpu(j, cpu_map) {
@@ -6490,9 +5939,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
6490} 5939}
6491 5940
6492struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 5941struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6493 struct s_data *d, const struct cpumask *cpu_map, 5942 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6494 struct sched_domain_attr *attr, struct sched_domain *child, 5943 struct sched_domain *child, int cpu)
6495 int cpu)
6496{ 5944{
6497 struct sched_domain *sd = tl->init(tl, cpu); 5945 struct sched_domain *sd = tl->init(tl, cpu);
6498 if (!sd) 5946 if (!sd)
@@ -6503,8 +5951,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6503 sd->level = child->level + 1; 5951 sd->level = child->level + 1;
6504 sched_domain_level_max = max(sched_domain_level_max, sd->level); 5952 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6505 child->parent = sd; 5953 child->parent = sd;
5954 sd->child = child;
6506 } 5955 }
6507 sd->child = child;
6508 set_domain_attribute(sd, attr); 5956 set_domain_attribute(sd, attr);
6509 5957
6510 return sd; 5958 return sd;
@@ -6517,7 +5965,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6517static int build_sched_domains(const struct cpumask *cpu_map, 5965static int build_sched_domains(const struct cpumask *cpu_map,
6518 struct sched_domain_attr *attr) 5966 struct sched_domain_attr *attr)
6519{ 5967{
6520 enum s_alloc alloc_state = sa_none; 5968 enum s_alloc alloc_state;
6521 struct sched_domain *sd; 5969 struct sched_domain *sd;
6522 struct s_data d; 5970 struct s_data d;
6523 int i, ret = -ENOMEM; 5971 int i, ret = -ENOMEM;
@@ -6531,18 +5979,15 @@ static int build_sched_domains(const struct cpumask *cpu_map,
6531 struct sched_domain_topology_level *tl; 5979 struct sched_domain_topology_level *tl;
6532 5980
6533 sd = NULL; 5981 sd = NULL;
6534 for (tl = sched_domain_topology; tl->init; tl++) { 5982 for_each_sd_topology(tl) {
6535 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 5983 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
5984 if (tl == sched_domain_topology)
5985 *per_cpu_ptr(d.sd, i) = sd;
6536 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 5986 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6537 sd->flags |= SD_OVERLAP; 5987 sd->flags |= SD_OVERLAP;
6538 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 5988 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6539 break; 5989 break;
6540 } 5990 }
6541
6542 while (sd->child)
6543 sd = sd->child;
6544
6545 *per_cpu_ptr(d.sd, i) = sd;
6546 } 5991 }
6547 5992
6548 /* Build the groups for the domains */ 5993 /* Build the groups for the domains */
@@ -6854,9 +6299,6 @@ void __init sched_init_smp(void)
6854 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6299 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6855 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6300 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6856 6301
6857 /* RT runtime code needs to handle some hotplug events */
6858 hotcpu_notifier(update_runtime, 0);
6859
6860 init_hrtick(); 6302 init_hrtick();
6861 6303
6862 /* Move init over to a non-isolated CPU */ 6304 /* Move init over to a non-isolated CPU */
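The sched-domain rework above reads more easily as a whole: build_sched_domain() now wires up both directions of the parent/child link itself, and build_sched_domains() records the per-CPU base domain as soon as the lowest topology level is built, instead of walking back down the child chain afterwards. A simplified sketch of the new per-CPU loop, using the same names as the hunks above (a paraphrase of the patch with the SD_OVERLAP handling omitted, not literal kernel code):

	sd = NULL;
	for_each_sd_topology(tl) {			/* lowest (e.g. SMT/MC) level first */
		sd = build_sched_domain(tl, cpu_map, attr, sd, i);
		if (tl == sched_domain_topology)
			*per_cpu_ptr(d.sd, i) = sd;	/* base domain == first level built */
		if (cpumask_equal(cpu_map, sched_domain_span(sd)))
			break;				/* this level already spans cpu_map */
	}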
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index cc2dc3eea8a3..a7959e05a9d5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -515,9 +515,8 @@ static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
515 515
516 for (;;) { 516 for (;;) {
517 /* Make sure "rtime" is the bigger of stime/rtime */ 517 /* Make sure "rtime" is the bigger of stime/rtime */
518 if (stime > rtime) { 518 if (stime > rtime)
519 u64 tmp = rtime; rtime = stime; stime = tmp; 519 swap(rtime, stime);
520 }
521 520
522 /* Make sure 'total' fits in 32 bits */ 521 /* Make sure 'total' fits in 32 bits */
523 if (total >> 32) 522 if (total >> 32)
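For context (the rest of scale_stime() lies outside this hunk, so treat this as a reading of the surrounding code rather than something the patch states): the function scales stime by rtime/total while keeping the intermediates within 64 bits, and the swap() above only guarantees that the larger of the two values ends up in rtime before that scaling starts. For example, with stime = 4, total = stime + utime = 10 and rtime = 20, the scaled system time is 4 * 20 / 10 = 8, leaving 20 - 8 = 12 as user time.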
@@ -747,17 +746,17 @@ void arch_vtime_task_switch(struct task_struct *prev)
747 746
748 write_seqlock(&current->vtime_seqlock); 747 write_seqlock(&current->vtime_seqlock);
749 current->vtime_snap_whence = VTIME_SYS; 748 current->vtime_snap_whence = VTIME_SYS;
750 current->vtime_snap = sched_clock(); 749 current->vtime_snap = sched_clock_cpu(smp_processor_id());
751 write_sequnlock(&current->vtime_seqlock); 750 write_sequnlock(&current->vtime_seqlock);
752} 751}
753 752
754void vtime_init_idle(struct task_struct *t) 753void vtime_init_idle(struct task_struct *t, int cpu)
755{ 754{
756 unsigned long flags; 755 unsigned long flags;
757 756
758 write_seqlock_irqsave(&t->vtime_seqlock, flags); 757 write_seqlock_irqsave(&t->vtime_seqlock, flags);
759 t->vtime_snap_whence = VTIME_SYS; 758 t->vtime_snap_whence = VTIME_SYS;
760 t->vtime_snap = sched_clock(); 759 t->vtime_snap = sched_clock_cpu(cpu);
761 write_sequnlock_irqrestore(&t->vtime_seqlock, flags); 760 write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
762} 761}
763 762
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 75024a673520..e076bddd4c66 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -209,22 +209,24 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
209 cfs_rq->nr_spread_over); 209 cfs_rq->nr_spread_over);
210 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); 210 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
211 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 211 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
212#ifdef CONFIG_FAIR_GROUP_SCHED
213#ifdef CONFIG_SMP 212#ifdef CONFIG_SMP
214 SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", 213 SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg",
215 cfs_rq->runnable_load_avg); 214 cfs_rq->runnable_load_avg);
216 SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", 215 SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
217 cfs_rq->blocked_load_avg); 216 cfs_rq->blocked_load_avg);
218 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg", 217#ifdef CONFIG_FAIR_GROUP_SCHED
219 (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg)); 218 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
220 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
221 cfs_rq->tg_load_contrib); 219 cfs_rq->tg_load_contrib);
222 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib", 220 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
223 cfs_rq->tg_runnable_contrib); 221 cfs_rq->tg_runnable_contrib);
222 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
223 atomic_long_read(&cfs_rq->tg->load_avg));
224 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg", 224 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
225 atomic_read(&cfs_rq->tg->runnable_avg)); 225 atomic_read(&cfs_rq->tg->runnable_avg));
226#endif 226#endif
227#endif
227 228
229#ifdef CONFIG_FAIR_GROUP_SCHED
228 print_cfs_group_stats(m, cpu, cfs_rq->tg); 230 print_cfs_group_stats(m, cpu, cfs_rq->tg);
229#endif 231#endif
230} 232}
@@ -493,15 +495,16 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
493 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, 495 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
494 get_nr_threads(p)); 496 get_nr_threads(p));
495 SEQ_printf(m, 497 SEQ_printf(m,
496 "---------------------------------------------------------\n"); 498 "---------------------------------------------------------"
499 "----------\n");
497#define __P(F) \ 500#define __P(F) \
498 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) 501 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
499#define P(F) \ 502#define P(F) \
500 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) 503 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
501#define __PN(F) \ 504#define __PN(F) \
502 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) 505 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
503#define PN(F) \ 506#define PN(F) \
504 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) 507 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
505 508
506 PN(se.exec_start); 509 PN(se.exec_start);
507 PN(se.vruntime); 510 PN(se.vruntime);
@@ -560,12 +563,18 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
560 } 563 }
561#endif 564#endif
562 __P(nr_switches); 565 __P(nr_switches);
563 SEQ_printf(m, "%-35s:%21Ld\n", 566 SEQ_printf(m, "%-45s:%21Ld\n",
564 "nr_voluntary_switches", (long long)p->nvcsw); 567 "nr_voluntary_switches", (long long)p->nvcsw);
565 SEQ_printf(m, "%-35s:%21Ld\n", 568 SEQ_printf(m, "%-45s:%21Ld\n",
566 "nr_involuntary_switches", (long long)p->nivcsw); 569 "nr_involuntary_switches", (long long)p->nivcsw);
567 570
568 P(se.load.weight); 571 P(se.load.weight);
572#ifdef CONFIG_SMP
573 P(se.avg.runnable_avg_sum);
574 P(se.avg.runnable_avg_period);
575 P(se.avg.load_avg_contrib);
576 P(se.avg.decay_count);
577#endif
569 P(policy); 578 P(policy);
570 P(prio); 579 P(prio);
571#undef PN 580#undef PN
@@ -579,7 +588,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
579 588
580 t0 = cpu_clock(this_cpu); 589 t0 = cpu_clock(this_cpu);
581 t1 = cpu_clock(this_cpu); 590 t1 = cpu_clock(this_cpu);
582 SEQ_printf(m, "%-35s:%21Ld\n", 591 SEQ_printf(m, "%-45s:%21Ld\n",
583 "clock-delta", (long long)(t1-t0)); 592 "clock-delta", (long long)(t1-t0));
584 } 593 }
585} 594}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c61a614465c8..f77f9c527449 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -113,6 +113,24 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
113unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 113unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
114#endif 114#endif
115 115
116static inline void update_load_add(struct load_weight *lw, unsigned long inc)
117{
118 lw->weight += inc;
119 lw->inv_weight = 0;
120}
121
122static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
123{
124 lw->weight -= dec;
125 lw->inv_weight = 0;
126}
127
128static inline void update_load_set(struct load_weight *lw, unsigned long w)
129{
130 lw->weight = w;
131 lw->inv_weight = 0;
132}
133
116/* 134/*
117 * Increase the granularity value when there are more CPUs, 135 * Increase the granularity value when there are more CPUs,
118 * because with more CPUs the 'effective latency' as visible 136 * because with more CPUs the 'effective latency' as visible
@@ -662,6 +680,26 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
662 return calc_delta_fair(sched_slice(cfs_rq, se), se); 680 return calc_delta_fair(sched_slice(cfs_rq, se), se);
663} 681}
664 682
683#ifdef CONFIG_SMP
684static inline void __update_task_entity_contrib(struct sched_entity *se);
685
686/* Give a new task full starting runnable values so it carries a heavy load contribution in its infancy */
687void init_task_runnable_average(struct task_struct *p)
688{
689 u32 slice;
690
691 p->se.avg.decay_count = 0;
692 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
693 p->se.avg.runnable_avg_sum = slice;
694 p->se.avg.runnable_avg_period = slice;
695 __update_task_entity_contrib(&p->se);
696}
697#else
698void init_task_runnable_average(struct task_struct *p)
699{
700}
701#endif
702
665/* 703/*
666 * Update the current task's runtime statistics. Skip current tasks that 704 * Update the current task's runtime statistics. Skip current tasks that
667 * are not in our scheduling class. 705 * are not in our scheduling class.
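The forked-task initialization added above can be made concrete with some assumed numbers (purely illustrative; the actual slice length depends on the runqueue): sched_slice() is in nanoseconds and the >> 10 converts it to the ~1 microsecond granularity the per-entity averages use, so a new task starts out looking 100% runnable over one scheduling slice:

	slice = 6000000 >> 10;			/* assume a 6 ms slice -> ~5859 */
	p->se.avg.runnable_avg_sum    = slice;
	p->se.avg.runnable_avg_period = slice;
	/* sum == period, so __update_task_entity_contrib() gives the task
	 * roughly its full weight as its initial load_avg_contrib */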
@@ -686,7 +724,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
686static void update_curr(struct cfs_rq *cfs_rq) 724static void update_curr(struct cfs_rq *cfs_rq)
687{ 725{
688 struct sched_entity *curr = cfs_rq->curr; 726 struct sched_entity *curr = cfs_rq->curr;
689 u64 now = rq_of(cfs_rq)->clock_task; 727 u64 now = rq_clock_task(rq_of(cfs_rq));
690 unsigned long delta_exec; 728 unsigned long delta_exec;
691 729
692 if (unlikely(!curr)) 730 if (unlikely(!curr))
@@ -718,7 +756,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
718static inline void 756static inline void
719update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 757update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
720{ 758{
721 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); 759 schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
722} 760}
723 761
724/* 762/*
@@ -738,14 +776,14 @@ static void
738update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 776update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
739{ 777{
740 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, 778 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
741 rq_of(cfs_rq)->clock - se->statistics.wait_start)); 779 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
742 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); 780 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
743 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + 781 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
744 rq_of(cfs_rq)->clock - se->statistics.wait_start); 782 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
745#ifdef CONFIG_SCHEDSTATS 783#ifdef CONFIG_SCHEDSTATS
746 if (entity_is_task(se)) { 784 if (entity_is_task(se)) {
747 trace_sched_stat_wait(task_of(se), 785 trace_sched_stat_wait(task_of(se),
748 rq_of(cfs_rq)->clock - se->statistics.wait_start); 786 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
749 } 787 }
750#endif 788#endif
751 schedstat_set(se->statistics.wait_start, 0); 789 schedstat_set(se->statistics.wait_start, 0);
@@ -771,7 +809,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
771 /* 809 /*
772 * We are starting a new run period: 810 * We are starting a new run period:
773 */ 811 */
774 se->exec_start = rq_of(cfs_rq)->clock_task; 812 se->exec_start = rq_clock_task(rq_of(cfs_rq));
775} 813}
776 814
777/************************************************** 815/**************************************************
@@ -1037,7 +1075,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1037 * to gain a more accurate current total weight. See 1075 * to gain a more accurate current total weight. See
1038 * update_cfs_rq_load_contribution(). 1076 * update_cfs_rq_load_contribution().
1039 */ 1077 */
1040 tg_weight = atomic64_read(&tg->load_avg); 1078 tg_weight = atomic_long_read(&tg->load_avg);
1041 tg_weight -= cfs_rq->tg_load_contrib; 1079 tg_weight -= cfs_rq->tg_load_contrib;
1042 tg_weight += cfs_rq->load.weight; 1080 tg_weight += cfs_rq->load.weight;
1043 1081
@@ -1110,8 +1148,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1110} 1148}
1111#endif /* CONFIG_FAIR_GROUP_SCHED */ 1149#endif /* CONFIG_FAIR_GROUP_SCHED */
1112 1150
1113/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ 1151#ifdef CONFIG_SMP
1114#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1115/* 1152/*
1116 * We choose a half-life close to 1 scheduling period. 1153 * We choose a half-life close to 1 scheduling period.
1117 * Note: The tables below are dependent on this value. 1154 * Note: The tables below are dependent on this value.
@@ -1319,13 +1356,13 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1319 int force_update) 1356 int force_update)
1320{ 1357{
1321 struct task_group *tg = cfs_rq->tg; 1358 struct task_group *tg = cfs_rq->tg;
1322 s64 tg_contrib; 1359 long tg_contrib;
1323 1360
1324 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; 1361 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1325 tg_contrib -= cfs_rq->tg_load_contrib; 1362 tg_contrib -= cfs_rq->tg_load_contrib;
1326 1363
1327 if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { 1364 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1328 atomic64_add(tg_contrib, &tg->load_avg); 1365 atomic_long_add(tg_contrib, &tg->load_avg);
1329 cfs_rq->tg_load_contrib += tg_contrib; 1366 cfs_rq->tg_load_contrib += tg_contrib;
1330 } 1367 }
1331} 1368}
@@ -1360,8 +1397,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
1360 u64 contrib; 1397 u64 contrib;
1361 1398
1362 contrib = cfs_rq->tg_load_contrib * tg->shares; 1399 contrib = cfs_rq->tg_load_contrib * tg->shares;
1363 se->avg.load_avg_contrib = div64_u64(contrib, 1400 se->avg.load_avg_contrib = div_u64(contrib,
1364 atomic64_read(&tg->load_avg) + 1); 1401 atomic_long_read(&tg->load_avg) + 1);
1365 1402
1366 /* 1403 /*
1367 * For group entities we need to compute a correction term in the case 1404 * For group entities we need to compute a correction term in the case
@@ -1480,8 +1517,9 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1480 if (!decays && !force_update) 1517 if (!decays && !force_update)
1481 return; 1518 return;
1482 1519
1483 if (atomic64_read(&cfs_rq->removed_load)) { 1520 if (atomic_long_read(&cfs_rq->removed_load)) {
1484 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); 1521 unsigned long removed_load;
1522 removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
1485 subtract_blocked_load_contrib(cfs_rq, removed_load); 1523 subtract_blocked_load_contrib(cfs_rq, removed_load);
1486 } 1524 }
1487 1525
@@ -1497,7 +1535,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1497 1535
1498static inline void update_rq_runnable_avg(struct rq *rq, int runnable) 1536static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1499{ 1537{
1500 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); 1538 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
1501 __update_tg_runnable_avg(&rq->avg, &rq->cfs); 1539 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
1502} 1540}
1503 1541
@@ -1510,9 +1548,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1510 * We track migrations using entity decay_count <= 0, on a wake-up 1548 * We track migrations using entity decay_count <= 0, on a wake-up
1511 * migration we use a negative decay count to track the remote decays 1549 * migration we use a negative decay count to track the remote decays
1512 * accumulated while sleeping. 1550 * accumulated while sleeping.
1551 *
1552 * Newly forked tasks are enqueued with se->avg.decay_count == 0; they
1553 * are seen by enqueue_entity_load_avg() as a migration with an already
1554 * constructed load_avg_contrib.
1513 */ 1555 */
1514 if (unlikely(se->avg.decay_count <= 0)) { 1556 if (unlikely(se->avg.decay_count <= 0)) {
1515 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; 1557 se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
1516 if (se->avg.decay_count) { 1558 if (se->avg.decay_count) {
1517 /* 1559 /*
1518 * In a wake-up migration we have to approximate the 1560 * In a wake-up migration we have to approximate the
@@ -1530,7 +1572,13 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1530 } 1572 }
1531 wakeup = 0; 1573 wakeup = 0;
1532 } else { 1574 } else {
1533 __synchronize_entity_decay(se); 1575 /*
1576 * Task re-woke on same cpu (or else migrate_task_rq_fair()
1577 * would have made count negative); we must be careful to avoid
1578 * double-accounting blocked time after synchronizing decays.
1579 */
1580 se->avg.last_runnable_update += __synchronize_entity_decay(se)
1581 << 20;
1534 } 1582 }
1535 1583
1536 /* migrated tasks did not contribute to our blocked load */ 1584 /* migrated tasks did not contribute to our blocked load */
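The << 20 in the re-wake path above is a unit conversion: __synchronize_entity_decay() returns how many whole decay periods elapsed, and a decay period in the load-tracking code is 1 << 20 ns (about 1 ms), so bumping last_runnable_update by periods << 20 moves the timestamp past the interval that has already been decayed and keeps it from being counted again. With made-up numbers:

	periods = __synchronize_entity_decay(se);	/* say the entity slept ~3 periods */
	se->avg.last_runnable_update += periods << 20;	/* advance by ~3 ms worth of ns */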
@@ -1607,7 +1655,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1607 tsk = task_of(se); 1655 tsk = task_of(se);
1608 1656
1609 if (se->statistics.sleep_start) { 1657 if (se->statistics.sleep_start) {
1610 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; 1658 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
1611 1659
1612 if ((s64)delta < 0) 1660 if ((s64)delta < 0)
1613 delta = 0; 1661 delta = 0;
@@ -1624,7 +1672,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1624 } 1672 }
1625 } 1673 }
1626 if (se->statistics.block_start) { 1674 if (se->statistics.block_start) {
1627 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; 1675 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
1628 1676
1629 if ((s64)delta < 0) 1677 if ((s64)delta < 0)
1630 delta = 0; 1678 delta = 0;
@@ -1712,7 +1760,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1712{ 1760{
1713 /* 1761 /*
1714 * Update the normalized vruntime before updating min_vruntime 1762 * Update the normalized vruntime before updating min_vruntime
1715 * through callig update_curr(). 1763 * through calling update_curr().
1716 */ 1764 */
1717 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) 1765 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1718 se->vruntime += cfs_rq->min_vruntime; 1766 se->vruntime += cfs_rq->min_vruntime;
@@ -1805,9 +1853,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1805 struct task_struct *tsk = task_of(se); 1853 struct task_struct *tsk = task_of(se);
1806 1854
1807 if (tsk->state & TASK_INTERRUPTIBLE) 1855 if (tsk->state & TASK_INTERRUPTIBLE)
1808 se->statistics.sleep_start = rq_of(cfs_rq)->clock; 1856 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
1809 if (tsk->state & TASK_UNINTERRUPTIBLE) 1857 if (tsk->state & TASK_UNINTERRUPTIBLE)
1810 se->statistics.block_start = rq_of(cfs_rq)->clock; 1858 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
1811 } 1859 }
1812#endif 1860#endif
1813 } 1861 }
@@ -2082,7 +2130,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2082 if (unlikely(cfs_rq->throttle_count)) 2130 if (unlikely(cfs_rq->throttle_count))
2083 return cfs_rq->throttled_clock_task; 2131 return cfs_rq->throttled_clock_task;
2084 2132
2085 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; 2133 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
2086} 2134}
2087 2135
2088/* returns 0 on failure to allocate runtime */ 2136/* returns 0 on failure to allocate runtime */
@@ -2138,10 +2186,9 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2138static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) 2186static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2139{ 2187{
2140 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 2188 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2141 struct rq *rq = rq_of(cfs_rq);
2142 2189
2143 /* if the deadline is ahead of our clock, nothing to do */ 2190 /* if the deadline is ahead of our clock, nothing to do */
2144 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) 2191 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
2145 return; 2192 return;
2146 2193
2147 if (cfs_rq->runtime_remaining < 0) 2194 if (cfs_rq->runtime_remaining < 0)
@@ -2230,7 +2277,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
2230#ifdef CONFIG_SMP 2277#ifdef CONFIG_SMP
2231 if (!cfs_rq->throttle_count) { 2278 if (!cfs_rq->throttle_count) {
2232 /* adjust cfs_rq_clock_task() */ 2279 /* adjust cfs_rq_clock_task() */
2233 cfs_rq->throttled_clock_task_time += rq->clock_task - 2280 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
2234 cfs_rq->throttled_clock_task; 2281 cfs_rq->throttled_clock_task;
2235 } 2282 }
2236#endif 2283#endif
@@ -2245,7 +2292,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
2245 2292
2246 /* group is entering throttled state, stop time */ 2293 /* group is entering throttled state, stop time */
2247 if (!cfs_rq->throttle_count) 2294 if (!cfs_rq->throttle_count)
2248 cfs_rq->throttled_clock_task = rq->clock_task; 2295 cfs_rq->throttled_clock_task = rq_clock_task(rq);
2249 cfs_rq->throttle_count++; 2296 cfs_rq->throttle_count++;
2250 2297
2251 return 0; 2298 return 0;
@@ -2284,7 +2331,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2284 rq->nr_running -= task_delta; 2331 rq->nr_running -= task_delta;
2285 2332
2286 cfs_rq->throttled = 1; 2333 cfs_rq->throttled = 1;
2287 cfs_rq->throttled_clock = rq->clock; 2334 cfs_rq->throttled_clock = rq_clock(rq);
2288 raw_spin_lock(&cfs_b->lock); 2335 raw_spin_lock(&cfs_b->lock);
2289 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 2336 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2290 raw_spin_unlock(&cfs_b->lock); 2337 raw_spin_unlock(&cfs_b->lock);
@@ -2298,15 +2345,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
2298 int enqueue = 1; 2345 int enqueue = 1;
2299 long task_delta; 2346 long task_delta;
2300 2347
2301 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; 2348 se = cfs_rq->tg->se[cpu_of(rq)];
2302 2349
2303 cfs_rq->throttled = 0; 2350 cfs_rq->throttled = 0;
2351
2352 update_rq_clock(rq);
2353
2304 raw_spin_lock(&cfs_b->lock); 2354 raw_spin_lock(&cfs_b->lock);
2305 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; 2355 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
2306 list_del_rcu(&cfs_rq->throttled_list); 2356 list_del_rcu(&cfs_rq->throttled_list);
2307 raw_spin_unlock(&cfs_b->lock); 2357 raw_spin_unlock(&cfs_b->lock);
2308 2358
2309 update_rq_clock(rq);
2310 /* update hierarchical throttle state */ 2359 /* update hierarchical throttle state */
2311 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); 2360 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2312 2361
@@ -2599,10 +2648,6 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2599 throttle_cfs_rq(cfs_rq); 2648 throttle_cfs_rq(cfs_rq);
2600} 2649}
2601 2650
2602static inline u64 default_cfs_period(void);
2603static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2604static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2605
2606static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 2651static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2607{ 2652{
2608 struct cfs_bandwidth *cfs_b = 2653 struct cfs_bandwidth *cfs_b =
@@ -2706,7 +2751,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
2706#else /* CONFIG_CFS_BANDWIDTH */ 2751#else /* CONFIG_CFS_BANDWIDTH */
2707static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) 2752static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2708{ 2753{
2709 return rq_of(cfs_rq)->clock_task; 2754 return rq_clock_task(rq_of(cfs_rq));
2710} 2755}
2711 2756
2712static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2757static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
@@ -2919,7 +2964,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2919/* Used instead of source_load when we know the type == 0 */ 2964/* Used instead of source_load when we know the type == 0 */
2920static unsigned long weighted_cpuload(const int cpu) 2965static unsigned long weighted_cpuload(const int cpu)
2921{ 2966{
2922 return cpu_rq(cpu)->load.weight; 2967 return cpu_rq(cpu)->cfs.runnable_load_avg;
2923} 2968}
2924 2969
2925/* 2970/*
@@ -2964,9 +3009,10 @@ static unsigned long cpu_avg_load_per_task(int cpu)
2964{ 3009{
2965 struct rq *rq = cpu_rq(cpu); 3010 struct rq *rq = cpu_rq(cpu);
2966 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 3011 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
3012 unsigned long load_avg = rq->cfs.runnable_load_avg;
2967 3013
2968 if (nr_running) 3014 if (nr_running)
2969 return rq->load.weight / nr_running; 3015 return load_avg / nr_running;
2970 3016
2971 return 0; 3017 return 0;
2972} 3018}
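Switching weighted_cpuload() and cpu_avg_load_per_task() from rq->load.weight to cfs.runnable_load_avg changes what "CPU load" means to their callers: it is no longer the instantaneous sum of queued task weights but their time-averaged runnable contribution. An illustrative comparison with two nice-0 tasks (weight 1024 each) that are runnable about half the time:

	/* old: load.weight       == 2048 whenever both happen to be queued
	 * new: runnable_load_avg ~= 1024, tracking how busy they really are
	 * cpu_avg_load_per_task() drops accordingly from 1024 to ~512 */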
@@ -3416,12 +3462,6 @@ unlock:
3416} 3462}
3417 3463
3418/* 3464/*
3419 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
3420 * removed when useful for applications beyond shares distribution (e.g.
3421 * load-balance).
3422 */
3423#ifdef CONFIG_FAIR_GROUP_SCHED
3424/*
3425 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 3465 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3426 * cfs_rq_of(p) references at time of call are still valid and identify the 3466 * cfs_rq_of(p) references at time of call are still valid and identify the
3427 * previous cpu. However, the caller only guarantees p->pi_lock is held; no 3467 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
@@ -3441,10 +3481,10 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3441 */ 3481 */
3442 if (se->avg.decay_count) { 3482 if (se->avg.decay_count) {
3443 se->avg.decay_count = -__synchronize_entity_decay(se); 3483 se->avg.decay_count = -__synchronize_entity_decay(se);
3444 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); 3484 atomic_long_add(se->avg.load_avg_contrib,
3485 &cfs_rq->removed_load);
3445 } 3486 }
3446} 3487}
3447#endif
3448#endif /* CONFIG_SMP */ 3488#endif /* CONFIG_SMP */
3449 3489
3450static unsigned long 3490static unsigned long
@@ -3946,7 +3986,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3946 * 2) too many balance attempts have failed. 3986 * 2) too many balance attempts have failed.
3947 */ 3987 */
3948 3988
3949 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); 3989 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
3950 if (!tsk_cache_hot || 3990 if (!tsk_cache_hot ||
3951 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 3991 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3952 3992
@@ -4141,11 +4181,11 @@ static int tg_load_down(struct task_group *tg, void *data)
4141 long cpu = (long)data; 4181 long cpu = (long)data;
4142 4182
4143 if (!tg->parent) { 4183 if (!tg->parent) {
4144 load = cpu_rq(cpu)->load.weight; 4184 load = cpu_rq(cpu)->avg.load_avg_contrib;
4145 } else { 4185 } else {
4146 load = tg->parent->cfs_rq[cpu]->h_load; 4186 load = tg->parent->cfs_rq[cpu]->h_load;
4147 load *= tg->se[cpu]->load.weight; 4187 load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
4148 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 4188 tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
4149 } 4189 }
4150 4190
4151 tg->cfs_rq[cpu]->h_load = load; 4191 tg->cfs_rq[cpu]->h_load = load;
@@ -4171,12 +4211,9 @@ static void update_h_load(long cpu)
4171static unsigned long task_h_load(struct task_struct *p) 4211static unsigned long task_h_load(struct task_struct *p)
4172{ 4212{
4173 struct cfs_rq *cfs_rq = task_cfs_rq(p); 4213 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4174 unsigned long load;
4175
4176 load = p->se.load.weight;
4177 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
4178 4214
4179 return load; 4215 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
4216 cfs_rq->runnable_load_avg + 1);
4180} 4217}
4181#else 4218#else
4182static inline void update_blocked_averages(int cpu) 4219static inline void update_blocked_averages(int cpu)
@@ -4189,7 +4226,7 @@ static inline void update_h_load(long cpu)
4189 4226
4190static unsigned long task_h_load(struct task_struct *p) 4227static unsigned long task_h_load(struct task_struct *p)
4191{ 4228{
4192 return p->se.load.weight; 4229 return p->se.avg.load_avg_contrib;
4193} 4230}
4194#endif 4231#endif
4195 4232
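Taken together, tg_load_down() and task_h_load() above now express a task's hierarchical load purely in averaged terms: each level scales its parent's h_load by the group entity's share of the parent's runnable_load_avg, and the task finally takes its own share of the leaf cfs_rq. A worked example with assumed values (not from the patch):

	/* root cfs_rq on this cpu: rq->avg.load_avg_contrib = 2048 -> h_load = 2048
	 * child group's se contributes 512 of the root's runnable_load_avg (2048):
	 *	h_load(child) = 2048 * 512 / (2048 + 1) ~= 512
	 * a task contributing 256 of the child's runnable_load_avg (512):
	 *	task_h_load(p) = 256 * 512 / (512 + 1)  ~= 255 */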
@@ -4302,7 +4339,7 @@ static unsigned long scale_rt_power(int cpu)
4302 age_stamp = ACCESS_ONCE(rq->age_stamp); 4339 age_stamp = ACCESS_ONCE(rq->age_stamp);
4303 avg = ACCESS_ONCE(rq->rt_avg); 4340 avg = ACCESS_ONCE(rq->rt_avg);
4304 4341
4305 total = sched_avg_period() + (rq->clock - age_stamp); 4342 total = sched_avg_period() + (rq_clock(rq) - age_stamp);
4306 4343
4307 if (unlikely(total < avg)) { 4344 if (unlikely(total < avg)) {
4308 /* Ensures that power won't end up being negative */ 4345 /* Ensures that power won't end up being negative */
@@ -5241,7 +5278,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5241 int pulled_task = 0; 5278 int pulled_task = 0;
5242 unsigned long next_balance = jiffies + HZ; 5279 unsigned long next_balance = jiffies + HZ;
5243 5280
5244 this_rq->idle_stamp = this_rq->clock; 5281 this_rq->idle_stamp = rq_clock(this_rq);
5245 5282
5246 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5283 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5247 return; 5284 return;
@@ -5418,10 +5455,9 @@ static inline void nohz_balance_exit_idle(int cpu)
5418static inline void set_cpu_sd_state_busy(void) 5455static inline void set_cpu_sd_state_busy(void)
5419{ 5456{
5420 struct sched_domain *sd; 5457 struct sched_domain *sd;
5421 int cpu = smp_processor_id();
5422 5458
5423 rcu_read_lock(); 5459 rcu_read_lock();
5424 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); 5460 sd = rcu_dereference_check_sched_domain(this_rq()->sd);
5425 5461
5426 if (!sd || !sd->nohz_idle) 5462 if (!sd || !sd->nohz_idle)
5427 goto unlock; 5463 goto unlock;
@@ -5436,10 +5472,9 @@ unlock:
5436void set_cpu_sd_state_idle(void) 5472void set_cpu_sd_state_idle(void)
5437{ 5473{
5438 struct sched_domain *sd; 5474 struct sched_domain *sd;
5439 int cpu = smp_processor_id();
5440 5475
5441 rcu_read_lock(); 5476 rcu_read_lock();
5442 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); 5477 sd = rcu_dereference_check_sched_domain(this_rq()->sd);
5443 5478
5444 if (!sd || sd->nohz_idle) 5479 if (!sd || sd->nohz_idle)
5445 goto unlock; 5480 goto unlock;
@@ -5848,7 +5883,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
5848 se->vruntime -= cfs_rq->min_vruntime; 5883 se->vruntime -= cfs_rq->min_vruntime;
5849 } 5884 }
5850 5885
5851#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) 5886#ifdef CONFIG_SMP
5852 /* 5887 /*
5853 * Remove our load from contribution when we leave sched_fair 5888 * Remove our load from contribution when we leave sched_fair
5854 * and ensure we don't carry in an old decay_count if we 5889 * and ensure we don't carry in an old decay_count if we
@@ -5907,9 +5942,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
5907#ifndef CONFIG_64BIT 5942#ifndef CONFIG_64BIT
5908 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 5943 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5909#endif 5944#endif
5910#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) 5945#ifdef CONFIG_SMP
5911 atomic64_set(&cfs_rq->decay_counter, 1); 5946 atomic64_set(&cfs_rq->decay_counter, 1);
5912 atomic64_set(&cfs_rq->removed_load, 0); 5947 atomic_long_set(&cfs_rq->removed_load, 0);
5913#endif 5948#endif
5914} 5949}
5915 5950
@@ -6091,6 +6126,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6091 se = tg->se[i]; 6126 se = tg->se[i];
6092 /* Propagate contribution to hierarchy */ 6127 /* Propagate contribution to hierarchy */
6093 raw_spin_lock_irqsave(&rq->lock, flags); 6128 raw_spin_lock_irqsave(&rq->lock, flags);
6129
6130 /* Possible calls to update_curr() need rq clock */
6131 update_rq_clock(rq);
6094 for_each_sched_entity(se) 6132 for_each_sched_entity(se)
6095 update_cfs_shares(group_cfs_rq(se)); 6133 update_cfs_shares(group_cfs_rq(se));
6096 raw_spin_unlock_irqrestore(&rq->lock, flags); 6134 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6146,9 +6184,8 @@ const struct sched_class fair_sched_class = {
6146 6184
6147#ifdef CONFIG_SMP 6185#ifdef CONFIG_SMP
6148 .select_task_rq = select_task_rq_fair, 6186 .select_task_rq = select_task_rq_fair,
6149#ifdef CONFIG_FAIR_GROUP_SCHED
6150 .migrate_task_rq = migrate_task_rq_fair, 6187 .migrate_task_rq = migrate_task_rq_fair,
6151#endif 6188
6152 .rq_online = rq_online_fair, 6189 .rq_online = rq_online_fair,
6153 .rq_offline = rq_offline_fair, 6190 .rq_offline = rq_offline_fair,
6154 6191
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
new file mode 100644
index 000000000000..16f5a30f9c88
--- /dev/null
+++ b/kernel/sched/proc.c
@@ -0,0 +1,591 @@
1/*
2 * kernel/sched/proc.c
3 *
4 * Kernel load calculations, forked from sched/core.c
5 */
6
7#include <linux/export.h>
8
9#include "sched.h"
10
11unsigned long this_cpu_load(void)
12{
13 struct rq *this = this_rq();
14 return this->cpu_load[0];
15}
16
17
18/*
19 * Global load-average calculations
20 *
21 * We take a distributed and async approach to calculating the global load-avg
22 * in order to minimize overhead.
23 *
24 * The global load average is an exponentially decaying average of nr_running +
25 * nr_uninterruptible.
26 *
27 * Once every LOAD_FREQ:
28 *
29 * nr_active = 0;
30 * for_each_possible_cpu(cpu)
31 * nr_active += cpu_rq(cpu)->nr_running + cpu_rq(cpu)->nr_uninterruptible;
32 *
33 * avenrun[n] = avenrun[n] * exp_n + nr_active * (1 - exp_n)
34 *
35 * Due to a number of reasons the above turns in the mess below:
36 *
37 * - for_each_possible_cpu() is prohibitively expensive on machines with
38 * a serious number of cpus, therefore we need to take a distributed approach
39 * to calculating nr_active.
40 *
41 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
42 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
43 *
44 * So assuming nr_active := 0 when we start out -- true per definition, we
45 * can simply take per-cpu deltas and fold those into a global accumulate
46 * to obtain the same result. See calc_load_fold_active().
47 *
48 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
49 * across the machine, we assume 10 ticks is sufficient time for every
50 * cpu to have completed this task.
51 *
52 * This places an upper-bound on the IRQ-off latency of the machine. Then
53 * again, being late doesn't lose the delta, just wrecks the sample.
54 *
55 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
56 * this would add another cross-cpu cacheline miss and atomic operation
57 * to the wakeup path. Instead we increment on whatever cpu the task ran
58 * when it went into uninterruptible state and decrement on whatever cpu
59 * did the wakeup. This means that only the sum of nr_uninterruptible over
60 * all cpus yields the correct result.
61 *
62 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
63 */
64
65/* Variables and functions for calc_load */
66atomic_long_t calc_load_tasks;
67unsigned long calc_load_update;
68unsigned long avenrun[3];
69EXPORT_SYMBOL(avenrun); /* should be removed */
70
71/**
72 * get_avenrun - get the load average array
73 * @loads: pointer to dest load array
74 * @offset: offset to add
75 * @shift: shift count to shift the result left
76 *
77 * These values are estimates at best, so no need for locking.
78 */
79void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
80{
81 loads[0] = (avenrun[0] + offset) << shift;
82 loads[1] = (avenrun[1] + offset) << shift;
83 loads[2] = (avenrun[2] + offset) << shift;
84}
85
86long calc_load_fold_active(struct rq *this_rq)
87{
88 long nr_active, delta = 0;
89
90 nr_active = this_rq->nr_running;
91 nr_active += (long) this_rq->nr_uninterruptible;
92
93 if (nr_active != this_rq->calc_load_active) {
94 delta = nr_active - this_rq->calc_load_active;
95 this_rq->calc_load_active = nr_active;
96 }
97
98 return delta;
99}
100
101/*
102 * a1 = a0 * e + a * (1 - e)
103 */
104static unsigned long
105calc_load(unsigned long load, unsigned long exp, unsigned long active)
106{
107 load *= exp;
108 load += active * (FIXED_1 - exp);
109 load += 1UL << (FSHIFT - 1);
110 return load >> FSHIFT;
111}
112
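/*
 * Worked example of calc_load() above (illustrative, not part of the patch;
 * FSHIFT = 11, FIXED_1 = 2048 and EXP_1 = 1884 are the usual fixed-point
 * constants): starting from avenrun[0] == 0 with two runnable tasks,
 * active = 2 * FIXED_1 = 4096, one LOAD_FREQ step yields
 *
 *	(0 * 1884 + 4096 * (2048 - 1884) + 1024) >> 11 = 328
 *
 * which /proc/loadavg prints as roughly 0.16, slowly converging towards 2.00.
 */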
113#ifdef CONFIG_NO_HZ_COMMON
114/*
115 * Handle NO_HZ for the global load-average.
116 *
117 * Since the above described distributed algorithm to compute the global
118 * load-average relies on per-cpu sampling from the tick, it is affected by
119 * NO_HZ.
120 *
121 * The basic idea is to fold the nr_active delta into a global idle-delta upon
122 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
123 * when we read the global state.
124 *
125 * Obviously reality has to ruin such a delightfully simple scheme:
126 *
127 * - When we go NO_HZ idle during the window, we can negate our sample
128 * contribution, causing under-accounting.
129 *
130 * We avoid this by keeping two idle-delta counters and flipping them
131 * when the window starts, thus separating old and new NO_HZ load.
132 *
133 * The only trick is the slight shift in index flip for read vs write.
134 *
135 * 0s 5s 10s 15s
136 * +10 +10 +10 +10
137 * |-|-----------|-|-----------|-|-----------|-|
138 * r:0 0 1 1 0 0 1 1 0
139 * w:0 1 1 0 0 1 1 0 0
140 *
141 * This ensures we'll fold the old idle contribution in this window while
142 * accumulating the new one.
143 *
144 * - When we wake up from NO_HZ idle during the window, we push up our
145 * contribution, since we effectively move our sample point to a known
146 * busy state.
147 *
148 * This is solved by pushing the window forward, and thus skipping the
149 * sample, for this cpu (effectively using the idle-delta for this cpu which
150 * was in effect at the time the window opened). This also solves the issue
151 * of having to deal with a cpu having been in NOHZ idle for multiple
152 * LOAD_FREQ intervals.
153 *
154 * When making the ILB scale, we should try to pull this in as well.
155 */
156static atomic_long_t calc_load_idle[2];
157static int calc_load_idx;
158
159static inline int calc_load_write_idx(void)
160{
161 int idx = calc_load_idx;
162
163 /*
164 * See calc_global_nohz(), if we observe the new index, we also
165 * need to observe the new update time.
166 */
167 smp_rmb();
168
169 /*
170 * If the folding window started, make sure we start writing in the
171 * next idle-delta.
172 */
173 if (!time_before(jiffies, calc_load_update))
174 idx++;
175
176 return idx & 1;
177}
178
179static inline int calc_load_read_idx(void)
180{
181 return calc_load_idx & 1;
182}
183
184void calc_load_enter_idle(void)
185{
186 struct rq *this_rq = this_rq();
187 long delta;
188
189 /*
190 * We're going into NOHZ mode, if there's any pending delta, fold it
191 * into the pending idle delta.
192 */
193 delta = calc_load_fold_active(this_rq);
194 if (delta) {
195 int idx = calc_load_write_idx();
196 atomic_long_add(delta, &calc_load_idle[idx]);
197 }
198}
199
200void calc_load_exit_idle(void)
201{
202 struct rq *this_rq = this_rq();
203
204 /*
205 * If we're still before the sample window, we're done.
206 */
207 if (time_before(jiffies, this_rq->calc_load_update))
208 return;
209
210 /*
211 * We woke inside or after the sample window, this means we're already
212 * accounted through the nohz accounting, so skip the entire deal and
213 * sync up for the next window.
214 */
215 this_rq->calc_load_update = calc_load_update;
216 if (time_before(jiffies, this_rq->calc_load_update + 10))
217 this_rq->calc_load_update += LOAD_FREQ;
218}
219
220static long calc_load_fold_idle(void)
221{
222 int idx = calc_load_read_idx();
223 long delta = 0;
224
225 if (atomic_long_read(&calc_load_idle[idx]))
226 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
227
228 return delta;
229}
230
231/**
232 * fixed_power_int - compute: x^n, in O(log n) time
233 *
234 * @x: base of the power
235 * @frac_bits: fractional bits of @x
236 * @n: power to raise @x to.
237 *
238 * By exploiting the relation between the definition of the natural power
239 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
240 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
241 * (where: n_i \elem {0, 1}, the binary vector representing n),
242 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
243 * of course trivially computable in O(log_2 n), the length of our binary
244 * vector.
245 */
246static unsigned long
247fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
248{
249 unsigned long result = 1UL << frac_bits;
250
251 if (n) for (;;) {
252 if (n & 1) {
253 result *= x;
254 result += 1UL << (frac_bits - 1);
255 result >>= frac_bits;
256 }
257 n >>= 1;
258 if (!n)
259 break;
260 x *= x;
261 x += 1UL << (frac_bits - 1);
262 x >>= frac_bits;
263 }
264
265 return result;
266}
267
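/*
 * Worked example of fixed_power_int() (illustrative): with frac_bits = 7
 * (so 1.0 == 128), x = 96 ~= 0.75 and n = 4 (binary 100), the loop squares
 * x twice,
 *
 *	96 -> (96 * 96 + 64) >> 7 = 72		(~0.5625)
 *	72 -> (72 * 72 + 64) >> 7 = 41		(~0.3203)
 *
 * and folds x into the result only when n's set bit is reached, giving
 * 41 ~= 0.75^4 * 128 = 40.5 in O(log2 n) multiplies.  The degrade_factor[]
 * rows further down are exactly such powers of (2^idx - 1) / 2^idx,
 * precomputed on the same 7-bit scale.
 */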
268/*
269 * a1 = a0 * e + a * (1 - e)
270 *
271 * a2 = a1 * e + a * (1 - e)
272 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
273 * = a0 * e^2 + a * (1 - e) * (1 + e)
274 *
275 * a3 = a2 * e + a * (1 - e)
276 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
277 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
278 *
279 * ...
280 *
281 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
282 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
283 * = a0 * e^n + a * (1 - e^n)
284 *
285 * [1] application of the geometric series:
286 *
287 * n 1 - x^(n+1)
288 * S_n := \Sum x^i = -------------
289 * i=0 1 - x
290 */
291static unsigned long
292calc_load_n(unsigned long load, unsigned long exp,
293 unsigned long active, unsigned int n)
294{
295
296 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
297}
298
299/*
300 * NO_HZ can leave us missing all per-cpu ticks calling
301 * calc_load_account_active(), but since an idle CPU folds its delta into
302 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
303 * in the pending idle delta if our idle period crossed a load cycle boundary.
304 *
305 * Once we've updated the global active value, we need to apply the exponential
306 * weights adjusted to the number of cycles missed.
307 */
308static void calc_global_nohz(void)
309{
310 long delta, active, n;
311
312 if (!time_before(jiffies, calc_load_update + 10)) {
313 /*
314 * Catch-up, fold however many we are behind still
315 */
316 delta = jiffies - calc_load_update - 10;
317 n = 1 + (delta / LOAD_FREQ);
318
319 active = atomic_long_read(&calc_load_tasks);
320 active = active > 0 ? active * FIXED_1 : 0;
321
322 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
323 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
324 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
325
326 calc_load_update += n * LOAD_FREQ;
327 }
328
329 /*
330 * Flip the idle index...
331 *
332 * Make sure we first write the new time then flip the index, so that
333 * calc_load_write_idx() will see the new time when it reads the new
334 * index, this avoids a double flip messing things up.
335 */
336 smp_wmb();
337 calc_load_idx++;
338}
339#else /* !CONFIG_NO_HZ_COMMON */
340
341static inline long calc_load_fold_idle(void) { return 0; }
342static inline void calc_global_nohz(void) { }
343
344#endif /* CONFIG_NO_HZ_COMMON */
345
346/*
347 * calc_load - update the avenrun load estimates 10 ticks after the
348 * CPUs have updated calc_load_tasks.
349 */
350void calc_global_load(unsigned long ticks)
351{
352 long active, delta;
353
354 if (time_before(jiffies, calc_load_update + 10))
355 return;
356
357 /*
358 * Fold the 'old' idle-delta to include all NO_HZ cpus.
359 */
360 delta = calc_load_fold_idle();
361 if (delta)
362 atomic_long_add(delta, &calc_load_tasks);
363
364 active = atomic_long_read(&calc_load_tasks);
365 active = active > 0 ? active * FIXED_1 : 0;
366
367 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
368 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
369 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
370
371 calc_load_update += LOAD_FREQ;
372
373 /*
374 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
375 */
376 calc_global_nohz();
377}
378
379/*
380 * Called from update_cpu_load() to periodically update this CPU's
381 * active count.
382 */
383static void calc_load_account_active(struct rq *this_rq)
384{
385 long delta;
386
387 if (time_before(jiffies, this_rq->calc_load_update))
388 return;
389
390 delta = calc_load_fold_active(this_rq);
391 if (delta)
392 atomic_long_add(delta, &calc_load_tasks);
393
394 this_rq->calc_load_update += LOAD_FREQ;
395}
396
397/*
398 * End of global load-average stuff
399 */
400
401/*
402 * The exact cpuload at various idx values, calculated at every tick would be
403 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
404 *
405 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
406 * on nth tick when cpu may be busy, then we have:
407 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
408 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
409 *
410 * decay_load_missed() below does efficient calculation of
411 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
412 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
413 *
414 * The calculation is approximated on a 128 point scale.
415 * degrade_zero_ticks is the number of ticks after which load at any
416 * particular idx is approximated to be zero.
417 * degrade_factor is a precomputed table, a row for each load idx.
418 * Each column corresponds to degradation factor for a power of two ticks,
419 * based on 128 point scale.
420 * Example:
421 * row 2, col 3 (=12) says that the degradation at load idx 2 after
422 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
423 *
424 * With this power of 2 load factors, we can degrade the load n times
425 * by looking at 1 bits in n and doing as many mult/shift instead of
426 * n mult/shifts needed by the exact degradation.
427 */
428#define DEGRADE_SHIFT 7
429static const unsigned char
430 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
431static const unsigned char
432 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
433 {0, 0, 0, 0, 0, 0, 0, 0},
434 {64, 32, 8, 0, 0, 0, 0, 0},
435 {96, 72, 40, 12, 1, 0, 0},
436 {112, 98, 75, 43, 15, 1, 0},
437 {120, 112, 98, 76, 45, 16, 2} };
438
439/*
440 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
441 * would be when CPU is idle and so we just decay the old load without
442 * adding any new load.
443 */
444static unsigned long
445decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
446{
447 int j = 0;
448
449 if (!missed_updates)
450 return load;
451
452 if (missed_updates >= degrade_zero_ticks[idx])
453 return 0;
454
455 if (idx == 1)
456 return load >> missed_updates;
457
458 while (missed_updates) {
459 if (missed_updates % 2)
460 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
461
462 missed_updates >>= 1;
463 j++;
464 }
465 return load;
466}
467
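/*
 * Worked example of decay_load_missed() (illustrative): idx = 2,
 * missed_updates = 5 (binary 101) multiplies in degrade_factor[2][0] = 96
 * for bit 0 and degrade_factor[2][2] = 40 for bit 2:
 *
 *	load * 96/128 * 40/128 ~= load * 0.234
 *
 * close to the exact (3/4)^5 ~= 0.237, using two multiply+shift steps
 * instead of five.
 */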
468/*
469 * Update rq->cpu_load[] statistics. This function is usually called every
470 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
471 * every tick. We fix it up based on jiffies.
472 */
473static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
474 unsigned long pending_updates)
475{
476 int i, scale;
477
478 this_rq->nr_load_updates++;
479
480 /* Update our load: */
481 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
482 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
483 unsigned long old_load, new_load;
484
485 /* scale is effectively 1 << i now, and >> i divides by scale */
486
487 old_load = this_rq->cpu_load[i];
488 old_load = decay_load_missed(old_load, pending_updates - 1, i);
489 new_load = this_load;
490 /*
491 * Round up the averaging division if load is increasing. This
492 * prevents us from getting stuck on 9 if the load is 10, for
493 * example.
494 */
495 if (new_load > old_load)
496 new_load += scale - 1;
497
498 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
499 }
500
501 sched_avg_update(this_rq);
502}
503
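/*
 * Worked example of the round-up above (illustrative): for i = 1 the update
 * is cpu_load[1] = (old + new) >> 1.  With old = 9 and new = 10, plain
 * truncation gives (9 + 10) >> 1 = 9 on every tick; adding scale - 1 (here
 * +1) to an increasing new_load gives (9 + 11) >> 1 = 10, so the average
 * can actually reach the new, higher load.
 */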
504#ifdef CONFIG_SMP
505static inline unsigned long get_rq_runnable_load(struct rq *rq)
506{
507 return rq->cfs.runnable_load_avg;
508}
509#else
510static inline unsigned long get_rq_runnable_load(struct rq *rq)
511{
512 return rq->load.weight;
513}
514#endif
515
516#ifdef CONFIG_NO_HZ_COMMON
517/*
518 * There is no sane way to deal with nohz on smp when using jiffies because the
519 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
520 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
521 *
522 * Therefore we cannot use the delta approach from the regular tick since that
523 * would seriously skew the load calculation. However we'll make do for those
524 * updates happening while idle (nohz_idle_balance) or coming out of idle
525 * (tick_nohz_idle_exit).
526 *
527 * This means we might still be one tick off for nohz periods.
528 */
529
530/*
531 * Called from nohz_idle_balance() to update the load ratings before doing the
532 * idle balance.
533 */
534void update_idle_cpu_load(struct rq *this_rq)
535{
536 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
537 unsigned long load = get_rq_runnable_load(this_rq);
538 unsigned long pending_updates;
539
540 /*
541 * bail if there's load or we're actually up-to-date.
542 */
543 if (load || curr_jiffies == this_rq->last_load_update_tick)
544 return;
545
546 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
547 this_rq->last_load_update_tick = curr_jiffies;
548
549 __update_cpu_load(this_rq, load, pending_updates);
550}
551
552/*
553 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
554 */
555void update_cpu_load_nohz(void)
556{
557 struct rq *this_rq = this_rq();
558 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
559 unsigned long pending_updates;
560
561 if (curr_jiffies == this_rq->last_load_update_tick)
562 return;
563
564 raw_spin_lock(&this_rq->lock);
565 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
566 if (pending_updates) {
567 this_rq->last_load_update_tick = curr_jiffies;
568 /*
569 * We were idle, which means a load of 0; the current load might be
570 * !0 due to remote wakeups and the like.
571 */
572 __update_cpu_load(this_rq, 0, pending_updates);
573 }
574 raw_spin_unlock(&this_rq->lock);
575}
576#endif /* CONFIG_NO_HZ_COMMON */
577
578/*
579 * Called from scheduler_tick()
580 */
581void update_cpu_load_active(struct rq *this_rq)
582{
583 unsigned long load = get_rq_runnable_load(this_rq);
584 /*
585 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
586 */
587 this_rq->last_load_update_tick = jiffies;
588 __update_cpu_load(this_rq, load, 1);
589
590 calc_load_account_active(this_rq);
591}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 127a2c4cf4ab..01970c8e64df 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -399,20 +399,6 @@ static inline struct task_group *next_task_group(struct task_group *tg)
399 (iter = next_task_group(iter)) && \ 399 (iter = next_task_group(iter)) && \
400 (rt_rq = iter->rt_rq[cpu_of(rq)]);) 400 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
401 401
402static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
403{
404 list_add_rcu(&rt_rq->leaf_rt_rq_list,
405 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
406}
407
408static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
409{
410 list_del_rcu(&rt_rq->leaf_rt_rq_list);
411}
412
413#define for_each_leaf_rt_rq(rt_rq, rq) \
414 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
415
416#define for_each_sched_rt_entity(rt_se) \ 402#define for_each_sched_rt_entity(rt_se) \
417 for (; rt_se; rt_se = rt_se->parent) 403 for (; rt_se; rt_se = rt_se->parent)
418 404
@@ -472,7 +458,7 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
472#ifdef CONFIG_SMP 458#ifdef CONFIG_SMP
473static inline const struct cpumask *sched_rt_period_mask(void) 459static inline const struct cpumask *sched_rt_period_mask(void)
474{ 460{
475 return cpu_rq(smp_processor_id())->rd->span; 461 return this_rq()->rd->span;
476} 462}
477#else 463#else
478static inline const struct cpumask *sched_rt_period_mask(void) 464static inline const struct cpumask *sched_rt_period_mask(void)
@@ -509,17 +495,6 @@ typedef struct rt_rq *rt_rq_iter_t;
509#define for_each_rt_rq(rt_rq, iter, rq) \ 495#define for_each_rt_rq(rt_rq, iter, rq) \
510 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 496 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
511 497
512static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
513{
514}
515
516static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
517{
518}
519
520#define for_each_leaf_rt_rq(rt_rq, rq) \
521 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
522
523#define for_each_sched_rt_entity(rt_se) \ 498#define for_each_sched_rt_entity(rt_se) \
524 for (; rt_se; rt_se = NULL) 499 for (; rt_se; rt_se = NULL)
525 500
@@ -699,15 +674,6 @@ balanced:
699 } 674 }
700} 675}
701 676
702static void disable_runtime(struct rq *rq)
703{
704 unsigned long flags;
705
706 raw_spin_lock_irqsave(&rq->lock, flags);
707 __disable_runtime(rq);
708 raw_spin_unlock_irqrestore(&rq->lock, flags);
709}
710
711static void __enable_runtime(struct rq *rq) 677static void __enable_runtime(struct rq *rq)
712{ 678{
713 rt_rq_iter_t iter; 679 rt_rq_iter_t iter;
@@ -732,37 +698,6 @@ static void __enable_runtime(struct rq *rq)
732 } 698 }
733} 699}
734 700
735static void enable_runtime(struct rq *rq)
736{
737 unsigned long flags;
738
739 raw_spin_lock_irqsave(&rq->lock, flags);
740 __enable_runtime(rq);
741 raw_spin_unlock_irqrestore(&rq->lock, flags);
742}
743
744int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
745{
746 int cpu = (int)(long)hcpu;
747
748 switch (action) {
749 case CPU_DOWN_PREPARE:
750 case CPU_DOWN_PREPARE_FROZEN:
751 disable_runtime(cpu_rq(cpu));
752 return NOTIFY_OK;
753
754 case CPU_DOWN_FAILED:
755 case CPU_DOWN_FAILED_FROZEN:
756 case CPU_ONLINE:
757 case CPU_ONLINE_FROZEN:
758 enable_runtime(cpu_rq(cpu));
759 return NOTIFY_OK;
760
761 default:
762 return NOTIFY_DONE;
763 }
764}
765
766static int balance_runtime(struct rt_rq *rt_rq) 701static int balance_runtime(struct rt_rq *rt_rq)
767{ 702{
768 int more = 0; 703 int more = 0;
@@ -926,7 +861,7 @@ static void update_curr_rt(struct rq *rq)
926 if (curr->sched_class != &rt_sched_class) 861 if (curr->sched_class != &rt_sched_class)
927 return; 862 return;
928 863
929 delta_exec = rq->clock_task - curr->se.exec_start; 864 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
930 if (unlikely((s64)delta_exec <= 0)) 865 if (unlikely((s64)delta_exec <= 0))
931 return; 866 return;
932 867
@@ -936,7 +871,7 @@ static void update_curr_rt(struct rq *rq)
936 curr->se.sum_exec_runtime += delta_exec; 871 curr->se.sum_exec_runtime += delta_exec;
937 account_group_exec_runtime(curr, delta_exec); 872 account_group_exec_runtime(curr, delta_exec);
938 873
939 curr->se.exec_start = rq->clock_task; 874 curr->se.exec_start = rq_clock_task(rq);
940 cpuacct_charge(curr, delta_exec); 875 cpuacct_charge(curr, delta_exec);
941 876
942 sched_rt_avg_update(rq, delta_exec); 877 sched_rt_avg_update(rq, delta_exec);
@@ -1106,9 +1041,6 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1106 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 1041 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
1107 return; 1042 return;
1108 1043
1109 if (!rt_rq->rt_nr_running)
1110 list_add_leaf_rt_rq(rt_rq);
1111
1112 if (head) 1044 if (head)
1113 list_add(&rt_se->run_list, queue); 1045 list_add(&rt_se->run_list, queue);
1114 else 1046 else
@@ -1128,8 +1060,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
1128 __clear_bit(rt_se_prio(rt_se), array->bitmap); 1060 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1129 1061
1130 dec_rt_tasks(rt_se, rt_rq); 1062 dec_rt_tasks(rt_se, rt_rq);
1131 if (!rt_rq->rt_nr_running)
1132 list_del_leaf_rt_rq(rt_rq);
1133} 1063}
1134 1064
1135/* 1065/*
@@ -1385,7 +1315,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1385 } while (rt_rq); 1315 } while (rt_rq);
1386 1316
1387 p = rt_task_of(rt_se); 1317 p = rt_task_of(rt_se);
1388 p->se.exec_start = rq->clock_task; 1318 p->se.exec_start = rq_clock_task(rq);
1389 1319
1390 return p; 1320 return p;
1391} 1321}
@@ -1434,42 +1364,24 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1434 return 0; 1364 return 0;
1435} 1365}
1436 1366
1437/* Return the second highest RT task, NULL otherwise */ 1367/*
1438static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) 1368 * Return the highest pushable rq's task, which is suitable to be executed
1369 * on the cpu, NULL otherwise
1370 */
1371static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1439{ 1372{
1440 struct task_struct *next = NULL; 1373 struct plist_head *head = &rq->rt.pushable_tasks;
1441 struct sched_rt_entity *rt_se; 1374 struct task_struct *p;
1442 struct rt_prio_array *array;
1443 struct rt_rq *rt_rq;
1444 int idx;
1445
1446 for_each_leaf_rt_rq(rt_rq, rq) {
1447 array = &rt_rq->active;
1448 idx = sched_find_first_bit(array->bitmap);
1449next_idx:
1450 if (idx >= MAX_RT_PRIO)
1451 continue;
1452 if (next && next->prio <= idx)
1453 continue;
1454 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1455 struct task_struct *p;
1456 1375
1457 if (!rt_entity_is_task(rt_se)) 1376 if (!has_pushable_tasks(rq))
1458 continue; 1377 return NULL;
1459 1378
1460 p = rt_task_of(rt_se); 1379 plist_for_each_entry(p, head, pushable_tasks) {
1461 if (pick_rt_task(rq, p, cpu)) { 1380 if (pick_rt_task(rq, p, cpu))
1462 next = p; 1381 return p;
1463 break;
1464 }
1465 }
1466 if (!next) {
1467 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
1468 goto next_idx;
1469 }
1470 } 1382 }
1471 1383
1472 return next; 1384 return NULL;
1473} 1385}
1474 1386
1475static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1387static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
@@ -1743,12 +1655,10 @@ static int pull_rt_task(struct rq *this_rq)
1743 double_lock_balance(this_rq, src_rq); 1655 double_lock_balance(this_rq, src_rq);
1744 1656
1745 /* 1657 /*
1746 * Are there still pullable RT tasks? 1658 * We can pull only a task that is pushable
1659 * on its rq, and no others.
1747 */ 1660 */
1748 if (src_rq->rt.rt_nr_running <= 1) 1661 p = pick_highest_pushable_task(src_rq, this_cpu);
1749 goto skip;
1750
1751 p = pick_next_highest_task_rt(src_rq, this_cpu);
1752 1662
1753 /* 1663 /*
1754 * Do we have an RT task that preempts 1664 * Do we have an RT task that preempts
@@ -2037,7 +1947,7 @@ static void set_curr_task_rt(struct rq *rq)
2037{ 1947{
2038 struct task_struct *p = rq->curr; 1948 struct task_struct *p = rq->curr;
2039 1949
2040 p->se.exec_start = rq->clock_task; 1950 p->se.exec_start = rq_clock_task(rq);
2041 1951
2042 /* The running task is never eligible for pushing */ 1952 /* The running task is never eligible for pushing */
2043 dequeue_pushable_task(rq, p); 1953 dequeue_pushable_task(rq, p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ce39224d6155..ef0a7b2439dd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -10,8 +10,16 @@
10#include "cpupri.h" 10#include "cpupri.h"
11#include "cpuacct.h" 11#include "cpuacct.h"
12 12
13struct rq;
14
13extern __read_mostly int scheduler_running; 15extern __read_mostly int scheduler_running;
14 16
17extern unsigned long calc_load_update;
18extern atomic_long_t calc_load_tasks;
19
20extern long calc_load_fold_active(struct rq *this_rq);
21extern void update_cpu_load_active(struct rq *this_rq);
22
15/* 23/*
16 * Convert user-nice values [ -20 ... 0 ... 19 ] 24 * Convert user-nice values [ -20 ... 0 ... 19 ]
17 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 25 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -140,10 +148,11 @@ struct task_group {
140 struct cfs_rq **cfs_rq; 148 struct cfs_rq **cfs_rq;
141 unsigned long shares; 149 unsigned long shares;
142 150
143 atomic_t load_weight; 151#ifdef CONFIG_SMP
144 atomic64_t load_avg; 152 atomic_long_t load_avg;
145 atomic_t runnable_avg; 153 atomic_t runnable_avg;
146#endif 154#endif
155#endif
147 156
148#ifdef CONFIG_RT_GROUP_SCHED 157#ifdef CONFIG_RT_GROUP_SCHED
149 struct sched_rt_entity **rt_se; 158 struct sched_rt_entity **rt_se;
@@ -261,26 +270,21 @@ struct cfs_rq {
261#endif 270#endif
262 271
263#ifdef CONFIG_SMP 272#ifdef CONFIG_SMP
264/*
265 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
266 * removed when useful for applications beyond shares distribution (e.g.
267 * load-balance).
268 */
269#ifdef CONFIG_FAIR_GROUP_SCHED
270 /* 273 /*
271 * CFS Load tracking 274 * CFS Load tracking
272 * Under CFS, load is tracked on a per-entity basis and aggregated up. 275 * Under CFS, load is tracked on a per-entity basis and aggregated up.
273 * This allows for the description of both thread and group usage (in 276 * This allows for the description of both thread and group usage (in
274 * the FAIR_GROUP_SCHED case). 277 * the FAIR_GROUP_SCHED case).
275 */ 278 */
276 u64 runnable_load_avg, blocked_load_avg; 279 unsigned long runnable_load_avg, blocked_load_avg;
277 atomic64_t decay_counter, removed_load; 280 atomic64_t decay_counter;
278 u64 last_decay; 281 u64 last_decay;
279#endif /* CONFIG_FAIR_GROUP_SCHED */ 282 atomic_long_t removed_load;
280/* These always depend on CONFIG_FAIR_GROUP_SCHED */ 283
281#ifdef CONFIG_FAIR_GROUP_SCHED 284#ifdef CONFIG_FAIR_GROUP_SCHED
285 /* Required to track per-cpu representation of a task_group */
282 u32 tg_runnable_contrib; 286 u32 tg_runnable_contrib;
283 u64 tg_load_contrib; 287 unsigned long tg_load_contrib;
284#endif /* CONFIG_FAIR_GROUP_SCHED */ 288#endif /* CONFIG_FAIR_GROUP_SCHED */
285 289
286 /* 290 /*
@@ -353,7 +357,6 @@ struct rt_rq {
353 unsigned long rt_nr_boosted; 357 unsigned long rt_nr_boosted;
354 358
355 struct rq *rq; 359 struct rq *rq;
356 struct list_head leaf_rt_rq_list;
357 struct task_group *tg; 360 struct task_group *tg;
358#endif 361#endif
359}; 362};
@@ -540,6 +543,16 @@ DECLARE_PER_CPU(struct rq, runqueues);
540#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 543#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
541#define raw_rq() (&__raw_get_cpu_var(runqueues)) 544#define raw_rq() (&__raw_get_cpu_var(runqueues))
542 545
546static inline u64 rq_clock(struct rq *rq)
547{
548 return rq->clock;
549}
550
551static inline u64 rq_clock_task(struct rq *rq)
552{
553 return rq->clock_task;
554}
555
543#ifdef CONFIG_SMP 556#ifdef CONFIG_SMP
544 557
545#define rcu_dereference_check_sched_domain(p) \ 558#define rcu_dereference_check_sched_domain(p) \
@@ -884,24 +897,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
884#define WF_FORK 0x02 /* child wakeup after fork */ 897#define WF_FORK 0x02 /* child wakeup after fork */
885#define WF_MIGRATED 0x4 /* internal use, task got migrated */ 898#define WF_MIGRATED 0x4 /* internal use, task got migrated */
886 899
887static inline void update_load_add(struct load_weight *lw, unsigned long inc)
888{
889 lw->weight += inc;
890 lw->inv_weight = 0;
891}
892
893static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
894{
895 lw->weight -= dec;
896 lw->inv_weight = 0;
897}
898
899static inline void update_load_set(struct load_weight *lw, unsigned long w)
900{
901 lw->weight = w;
902 lw->inv_weight = 0;
903}
904
905/* 900/*
906 * To aid in avoiding the subversion of "niceness" due to uneven distribution 901 * To aid in avoiding the subversion of "niceness" due to uneven distribution
907 * of tasks with abnormal "nice" values across CPUs the contribution that 902 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1028,17 +1023,8 @@ extern void update_group_power(struct sched_domain *sd, int cpu);
1028extern void trigger_load_balance(struct rq *rq, int cpu); 1023extern void trigger_load_balance(struct rq *rq, int cpu);
1029extern void idle_balance(int this_cpu, struct rq *this_rq); 1024extern void idle_balance(int this_cpu, struct rq *this_rq);
1030 1025
1031/*
1032 * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
1033 * becomes useful in lb
1034 */
1035#if defined(CONFIG_FAIR_GROUP_SCHED)
1036extern void idle_enter_fair(struct rq *this_rq); 1026extern void idle_enter_fair(struct rq *this_rq);
1037extern void idle_exit_fair(struct rq *this_rq); 1027extern void idle_exit_fair(struct rq *this_rq);
1038#else
1039static inline void idle_enter_fair(struct rq *this_rq) {}
1040static inline void idle_exit_fair(struct rq *this_rq) {}
1041#endif
1042 1028
1043#else /* CONFIG_SMP */ 1029#else /* CONFIG_SMP */
1044 1030
@@ -1051,7 +1037,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
1051extern void sysrq_sched_debug_show(void); 1037extern void sysrq_sched_debug_show(void);
1052extern void sched_init_granularity(void); 1038extern void sched_init_granularity(void);
1053extern void update_max_interval(void); 1039extern void update_max_interval(void);
1054extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
1055extern void init_sched_rt_class(void); 1040extern void init_sched_rt_class(void);
1056extern void init_sched_fair_class(void); 1041extern void init_sched_fair_class(void);
1057 1042
@@ -1063,6 +1048,8 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
1063 1048
1064extern void update_idle_cpu_load(struct rq *this_rq); 1049extern void update_idle_cpu_load(struct rq *this_rq);
1065 1050
1051extern void init_task_runnable_average(struct task_struct *p);
1052
1066#ifdef CONFIG_PARAVIRT 1053#ifdef CONFIG_PARAVIRT
1067static inline u64 steal_ticks(u64 steal) 1054static inline u64 steal_ticks(u64 steal)
1068{ 1055{
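sched.h now routes clock reads through rq_clock() and rq_clock_task() instead of exposing the raw rq->clock and rq->clock_task fields, alongside the load-tracking fields shrinking to unsigned long / atomic_long_t under CONFIG_SMP. A tiny userspace sketch of that accessor pattern follows; struct fake_rq and its helpers are hypothetical, chosen only to show why callers go through an inline wrapper (later changes such as debug checks need to touch one place, not every reader).

#include <stdint.h>
#include <stdio.h>

struct fake_rq {
    uint64_t clock;        /* raw runqueue clock */
    uint64_t clock_task;   /* clock minus time stolen by irqs etc. */
};

static inline uint64_t fake_rq_clock(const struct fake_rq *rq)
{
    return rq->clock;
}

static inline uint64_t fake_rq_clock_task(const struct fake_rq *rq)
{
    return rq->clock_task;
}

int main(void)
{
    struct fake_rq rq = { .clock = 1000, .clock_task = 950 };

    printf("clock=%llu task_clock=%llu\n",
           (unsigned long long)fake_rq_clock(&rq),
           (unsigned long long)fake_rq_clock_task(&rq));
    return 0;
}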
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 2ef90a51ec5e..17d7065c3872 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -61,7 +61,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
61 */ 61 */
62static inline void sched_info_dequeued(struct task_struct *t) 62static inline void sched_info_dequeued(struct task_struct *t)
63{ 63{
64 unsigned long long now = task_rq(t)->clock, delta = 0; 64 unsigned long long now = rq_clock(task_rq(t)), delta = 0;
65 65
66 if (unlikely(sched_info_on())) 66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued) 67 if (t->sched_info.last_queued)
@@ -79,7 +79,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
79 */ 79 */
80static void sched_info_arrive(struct task_struct *t) 80static void sched_info_arrive(struct task_struct *t)
81{ 81{
82 unsigned long long now = task_rq(t)->clock, delta = 0; 82 unsigned long long now = rq_clock(task_rq(t)), delta = 0;
83 83
84 if (t->sched_info.last_queued) 84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued; 85 delta = now - t->sched_info.last_queued;
@@ -100,7 +100,7 @@ static inline void sched_info_queued(struct task_struct *t)
100{ 100{
101 if (unlikely(sched_info_on())) 101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued) 102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = task_rq(t)->clock; 103 t->sched_info.last_queued = rq_clock(task_rq(t));
104} 104}
105 105
106/* 106/*
@@ -112,7 +112,7 @@ static inline void sched_info_queued(struct task_struct *t)
112 */ 112 */
113static inline void sched_info_depart(struct task_struct *t) 113static inline void sched_info_depart(struct task_struct *t)
114{ 114{
115 unsigned long long delta = task_rq(t)->clock - 115 unsigned long long delta = rq_clock(task_rq(t)) -
116 t->sched_info.last_arrival; 116 t->sched_info.last_arrival;
117 117
118 rq_sched_info_depart(task_rq(t), delta); 118 rq_sched_info_depart(task_rq(t), delta);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index da5eb5bed84a..e08fbeeb54b9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
28 struct task_struct *stop = rq->stop; 28 struct task_struct *stop = rq->stop;
29 29
30 if (stop && stop->on_rq) { 30 if (stop && stop->on_rq) {
31 stop->se.exec_start = rq->clock_task; 31 stop->se.exec_start = rq_clock_task(rq);
32 return stop; 32 return stop;
33 } 33 }
34 34
@@ -57,7 +57,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
57 struct task_struct *curr = rq->curr; 57 struct task_struct *curr = rq->curr;
58 u64 delta_exec; 58 u64 delta_exec;
59 59
60 delta_exec = rq->clock_task - curr->se.exec_start; 60 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
61 if (unlikely((s64)delta_exec < 0)) 61 if (unlikely((s64)delta_exec < 0))
62 delta_exec = 0; 62 delta_exec = 0;
63 63
@@ -67,7 +67,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
67 curr->se.sum_exec_runtime += delta_exec; 67 curr->se.sum_exec_runtime += delta_exec;
68 account_group_exec_runtime(curr, delta_exec); 68 account_group_exec_runtime(curr, delta_exec);
69 69
70 curr->se.exec_start = rq->clock_task; 70 curr->se.exec_start = rq_clock_task(rq);
71 cpuacct_charge(curr, delta_exec); 71 cpuacct_charge(curr, delta_exec);
72} 72}
73 73
@@ -79,7 +79,7 @@ static void set_curr_task_stop(struct rq *rq)
79{ 79{
80 struct task_struct *stop = rq->stop; 80 struct task_struct *stop = rq->stop;
81 81
82 stop->se.exec_start = rq->clock_task; 82 stop->se.exec_start = rq_clock_task(rq);
83} 83}
84 84
85static void switched_to_stop(struct rq *rq, struct task_struct *p) 85static void switched_to_stop(struct rq *rq, struct task_struct *p)
diff --git a/kernel/signal.c b/kernel/signal.c
index 113411bfe8b1..50e41075ac77 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2848,7 +2848,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2848 recalc_sigpending(); 2848 recalc_sigpending();
2849 spin_unlock_irq(&tsk->sighand->siglock); 2849 spin_unlock_irq(&tsk->sighand->siglock);
2850 2850
2851 timeout = schedule_timeout_interruptible(timeout); 2851 timeout = freezable_schedule_timeout_interruptible(timeout);
2852 2852
2853 spin_lock_irq(&tsk->sighand->siglock); 2853 spin_lock_irq(&tsk->sighand->siglock);
2854 __set_task_blocked(tsk, &tsk->real_blocked); 2854 __set_task_blocked(tsk, &tsk->real_blocked);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b5197dcb0dad..ca25e6e704a2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -127,8 +127,7 @@ static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
127 127
128void local_bh_disable(void) 128void local_bh_disable(void)
129{ 129{
130 __local_bh_disable((unsigned long)__builtin_return_address(0), 130 __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET);
131 SOFTIRQ_DISABLE_OFFSET);
132} 131}
133 132
134EXPORT_SYMBOL(local_bh_disable); 133EXPORT_SYMBOL(local_bh_disable);
@@ -139,7 +138,7 @@ static void __local_bh_enable(unsigned int cnt)
139 WARN_ON_ONCE(!irqs_disabled()); 138 WARN_ON_ONCE(!irqs_disabled());
140 139
141 if (softirq_count() == cnt) 140 if (softirq_count() == cnt)
142 trace_softirqs_on((unsigned long)__builtin_return_address(0)); 141 trace_softirqs_on(_RET_IP_);
143 sub_preempt_count(cnt); 142 sub_preempt_count(cnt);
144} 143}
145 144
@@ -184,7 +183,7 @@ static inline void _local_bh_enable_ip(unsigned long ip)
184 183
185void local_bh_enable(void) 184void local_bh_enable(void)
186{ 185{
187 _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 186 _local_bh_enable_ip(_RET_IP_);
188} 187}
189EXPORT_SYMBOL(local_bh_enable); 188EXPORT_SYMBOL(local_bh_enable);
190 189
@@ -195,8 +194,12 @@ void local_bh_enable_ip(unsigned long ip)
195EXPORT_SYMBOL(local_bh_enable_ip); 194EXPORT_SYMBOL(local_bh_enable_ip);
196 195
197/* 196/*
198 * We restart softirq processing for at most 2 ms, 197 * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
199 * and if need_resched() is not set. 198 * but break the loop if need_resched() is set or after 2 ms.
199 * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in
200 * certain cases, such as stop_machine(), jiffies may cease to
201 * increment and so we need the MAX_SOFTIRQ_RESTART limit as
202 * well to make sure we eventually return from this method.
200 * 203 *
201 * These limits have been established via experimentation. 204 * These limits have been established via experimentation.
202 * The two things to balance is latency against fairness - 205 * The two things to balance is latency against fairness -
@@ -204,6 +207,7 @@ EXPORT_SYMBOL(local_bh_enable_ip);
204 * should not be able to lock up the box. 207 * should not be able to lock up the box.
205 */ 208 */
206#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) 209#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
210#define MAX_SOFTIRQ_RESTART 10
207 211
208asmlinkage void __do_softirq(void) 212asmlinkage void __do_softirq(void)
209{ 213{
@@ -212,6 +216,7 @@ asmlinkage void __do_softirq(void)
212 unsigned long end = jiffies + MAX_SOFTIRQ_TIME; 216 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
213 int cpu; 217 int cpu;
214 unsigned long old_flags = current->flags; 218 unsigned long old_flags = current->flags;
219 int max_restart = MAX_SOFTIRQ_RESTART;
215 220
216 /* 221 /*
217 * Mask out PF_MEMALLOC as current task context is borrowed for the 222
@@ -223,8 +228,7 @@ asmlinkage void __do_softirq(void)
223 pending = local_softirq_pending(); 228 pending = local_softirq_pending();
224 account_irq_enter_time(current); 229 account_irq_enter_time(current);
225 230
226 __local_bh_disable((unsigned long)__builtin_return_address(0), 231 __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET);
227 SOFTIRQ_OFFSET);
228 lockdep_softirq_enter(); 232 lockdep_softirq_enter();
229 233
230 cpu = smp_processor_id(); 234 cpu = smp_processor_id();
@@ -265,7 +269,8 @@ restart:
265 269
266 pending = local_softirq_pending(); 270 pending = local_softirq_pending();
267 if (pending) { 271 if (pending) {
268 if (time_before(jiffies, end) && !need_resched()) 272 if (time_before(jiffies, end) && !need_resched() &&
273 --max_restart)
269 goto restart; 274 goto restart;
270 275
271 wakeup_softirqd(); 276 wakeup_softirqd();
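Besides switching callers to _RET_IP_, the softirq change bounds __do_softirq() by an iteration count (MAX_SOFTIRQ_RESTART) in addition to the 2 ms time budget, so the loop still terminates when jiffies stops advancing (e.g. under stop_machine()). A userspace sketch of that dual bound, with hypothetical poll_pending() and budget constants:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define MAX_BUDGET_NS (2 * 1000 * 1000)   /* ~2 ms, like MAX_SOFTIRQ_TIME */
#define MAX_RESTART   10                  /* like MAX_SOFTIRQ_RESTART */

static long long now_ns(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static bool poll_pending(void)   /* stand-in for local_softirq_pending() */
{
    static int calls;

    return ++calls < 100;        /* pretend work keeps arriving for a while */
}

int main(void)
{
    long long end = now_ns() + MAX_BUDGET_NS;
    int max_restart = MAX_RESTART;
    int rounds = 0;

restart:
    rounds++;
    /* ... process one batch of pending work here ... */
    if (poll_pending()) {
        /* restart only while BOTH limits hold; otherwise hand off (wakeup_softirqd) */
        if (now_ns() < end && --max_restart)
            goto restart;
        printf("deferring remaining work after %d rounds\n", rounds);
        return 0;
    }
    printf("drained after %d rounds\n", rounds);
    return 0;
}

The counter is only decremented when the time check passes, matching the short-circuit order of the restart condition in the patch.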
diff --git a/kernel/sys.c b/kernel/sys.c
index b95d3c72ba21..071de900c824 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -362,6 +362,29 @@ int unregister_reboot_notifier(struct notifier_block *nb)
362} 362}
363EXPORT_SYMBOL(unregister_reboot_notifier); 363EXPORT_SYMBOL(unregister_reboot_notifier);
364 364
365/* Add backwards compatibility for stable trees. */
366#ifndef PF_NO_SETAFFINITY
367#define PF_NO_SETAFFINITY PF_THREAD_BOUND
368#endif
369
370static void migrate_to_reboot_cpu(void)
371{
372 /* The boot cpu is always logical cpu 0 */
373 int cpu = 0;
374
375 cpu_hotplug_disable();
376
377 /* Make certain the cpu I'm about to reboot on is online */
378 if (!cpu_online(cpu))
379 cpu = cpumask_first(cpu_online_mask);
380
381 /* Prevent races with other tasks migrating this task */
382 current->flags |= PF_NO_SETAFFINITY;
383
384 /* Make certain I only run on the appropriate processor */
385 set_cpus_allowed_ptr(current, cpumask_of(cpu));
386}
387
365/** 388/**
366 * kernel_restart - reboot the system 389 * kernel_restart - reboot the system
367 * @cmd: pointer to buffer containing command to execute for restart 390 * @cmd: pointer to buffer containing command to execute for restart
@@ -373,7 +396,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
373void kernel_restart(char *cmd) 396void kernel_restart(char *cmd)
374{ 397{
375 kernel_restart_prepare(cmd); 398 kernel_restart_prepare(cmd);
376 disable_nonboot_cpus(); 399 migrate_to_reboot_cpu();
377 syscore_shutdown(); 400 syscore_shutdown();
378 if (!cmd) 401 if (!cmd)
379 printk(KERN_EMERG "Restarting system.\n"); 402 printk(KERN_EMERG "Restarting system.\n");
@@ -400,7 +423,7 @@ static void kernel_shutdown_prepare(enum system_states state)
400void kernel_halt(void) 423void kernel_halt(void)
401{ 424{
402 kernel_shutdown_prepare(SYSTEM_HALT); 425 kernel_shutdown_prepare(SYSTEM_HALT);
403 disable_nonboot_cpus(); 426 migrate_to_reboot_cpu();
404 syscore_shutdown(); 427 syscore_shutdown();
405 printk(KERN_EMERG "System halted.\n"); 428 printk(KERN_EMERG "System halted.\n");
406 kmsg_dump(KMSG_DUMP_HALT); 429 kmsg_dump(KMSG_DUMP_HALT);
@@ -419,7 +442,7 @@ void kernel_power_off(void)
419 kernel_shutdown_prepare(SYSTEM_POWER_OFF); 442 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
420 if (pm_power_off_prepare) 443 if (pm_power_off_prepare)
421 pm_power_off_prepare(); 444 pm_power_off_prepare();
422 disable_nonboot_cpus(); 445 migrate_to_reboot_cpu();
423 syscore_shutdown(); 446 syscore_shutdown();
424 printk(KERN_EMERG "Power down.\n"); 447 printk(KERN_EMERG "Power down.\n");
425 kmsg_dump(KMSG_DUMP_POWEROFF); 448 kmsg_dump(KMSG_DUMP_POWEROFF);
@@ -488,7 +511,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
488 case LINUX_REBOOT_CMD_HALT: 511 case LINUX_REBOOT_CMD_HALT:
489 kernel_halt(); 512 kernel_halt();
490 do_exit(0); 513 do_exit(0);
491 panic("cannot halt"); 514 panic("cannot halt.\n");
492 515
493 case LINUX_REBOOT_CMD_POWER_OFF: 516 case LINUX_REBOOT_CMD_POWER_OFF:
494 kernel_power_off(); 517 kernel_power_off();
@@ -1286,6 +1309,17 @@ out:
1286 return retval; 1309 return retval;
1287} 1310}
1288 1311
1312static void set_special_pids(struct pid *pid)
1313{
1314 struct task_struct *curr = current->group_leader;
1315
1316 if (task_session(curr) != pid)
1317 change_pid(curr, PIDTYPE_SID, pid);
1318
1319 if (task_pgrp(curr) != pid)
1320 change_pid(curr, PIDTYPE_PGID, pid);
1321}
1322
1289SYSCALL_DEFINE0(setsid) 1323SYSCALL_DEFINE0(setsid)
1290{ 1324{
1291 struct task_struct *group_leader = current->group_leader; 1325 struct task_struct *group_leader = current->group_leader;
@@ -1305,7 +1339,7 @@ SYSCALL_DEFINE0(setsid)
1305 goto out; 1339 goto out;
1306 1340
1307 group_leader->signal->leader = 1; 1341 group_leader->signal->leader = 1;
1308 __set_special_pids(sid); 1342 set_special_pids(sid);
1309 1343
1310 proc_clear_tty(group_leader); 1344 proc_clear_tty(group_leader);
1311 1345
@@ -2332,8 +2366,7 @@ static int do_sysinfo(struct sysinfo *info)
2332 2366
2333 memset(info, 0, sizeof(struct sysinfo)); 2367 memset(info, 0, sizeof(struct sysinfo));
2334 2368
2335 ktime_get_ts(&tp); 2369 get_monotonic_boottime(&tp);
2336 monotonic_to_bootbased(&tp);
2337 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); 2370 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
2338 2371
2339 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); 2372 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
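kernel_restart()/kernel_halt()/kernel_power_off() now call migrate_to_reboot_cpu(), which pins the rebooting task to one online CPU instead of taking the non-boot CPUs offline, and setsid() gains a local set_special_pids() helper. The affinity trick can be mimicked from userspace with sched_setaffinity(); the sketch below only mirrors the "prefer CPU 0, else the first CPU I may run on" fallback, without the kernel's hotplug locking or PF_NO_SETAFFINITY protection.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
    int cpu = 0;           /* prefer the boot CPU */
    cpu_set_t set;

    if (sched_getaffinity(0, sizeof(set), &set) == 0 && !CPU_ISSET(cpu, &set)) {
        /* boot CPU not usable: fall back to the first CPU we may run on */
        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
            if (CPU_ISSET(cpu, &set))
                break;
    }

    CPU_ZERO(&set);
    CPU_SET(cpu, &set);
    if (sched_setaffinity(0, sizeof(set), &set) != 0) {
        perror("sched_setaffinity");
        return 1;
    }
    printf("now pinned to cpu %d (running on cpu %d)\n", cpu, sched_getcpu());
    return 0;
}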
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9edcf456e0fc..4ce13c3cedb9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -120,7 +120,6 @@ extern int blk_iopoll_enabled;
120/* Constants used for minimum and maximum */ 120/* Constants used for minimum and maximum */
121#ifdef CONFIG_LOCKUP_DETECTOR 121#ifdef CONFIG_LOCKUP_DETECTOR
122static int sixty = 60; 122static int sixty = 60;
123static int neg_one = -1;
124#endif 123#endif
125 124
126static int zero; 125static int zero;
@@ -814,7 +813,7 @@ static struct ctl_table kern_table[] = {
814 .maxlen = sizeof(int), 813 .maxlen = sizeof(int),
815 .mode = 0644, 814 .mode = 0644,
816 .proc_handler = proc_dowatchdog, 815 .proc_handler = proc_dowatchdog,
817 .extra1 = &neg_one, 816 .extra1 = &zero,
818 .extra2 = &sixty, 817 .extra2 = &sixty,
819 }, 818 },
820 { 819 {
@@ -1044,6 +1043,15 @@ static struct ctl_table kern_table[] = {
1044 .mode = 0644, 1043 .mode = 0644,
1045 .proc_handler = perf_proc_update_handler, 1044 .proc_handler = perf_proc_update_handler,
1046 }, 1045 },
1046 {
1047 .procname = "perf_cpu_time_max_percent",
1048 .data = &sysctl_perf_cpu_time_max_percent,
1049 .maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
1050 .mode = 0644,
1051 .proc_handler = perf_cpu_time_max_percent_handler,
1052 .extra1 = &zero,
1053 .extra2 = &one_hundred,
1054 },
1047#endif 1055#endif
1048#ifdef CONFIG_KMEMCHECK 1056#ifdef CONFIG_KMEMCHECK
1049 { 1057 {
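The new perf_cpu_time_max_percent entry uses extra1/extra2 to restrict writes to the range 0..100, and the watchdog threshold's lower bound tightens from -1 to 0. A small sketch of that reject-out-of-range-writes behaviour follows; the names are hypothetical, and the real entry delegates to perf_cpu_time_max_percent_handler on top of the generic min/max proc handling.

#include <stdio.h>

static const int min_percent = 0;     /* extra1 */
static const int max_percent = 100;   /* extra2 */
static int perf_cpu_time_pct = 25;    /* current setting */

/* Reject writes outside [min, max], as the min/max proc handler does. */
static int write_percent(int new_value)
{
    if (new_value < min_percent || new_value > max_percent)
        return -1;   /* the kernel would return -EINVAL */
    perf_cpu_time_pct = new_value;
    return 0;
}

int main(void)
{
    int ret;

    ret = write_percent(50);
    printf("write 50  -> %d (now %d)\n", ret, perf_cpu_time_pct);

    ret = write_percent(150);
    printf("write 150 -> %d (now %d)\n", ret, perf_cpu_time_pct);
    return 0;
}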
diff --git a/kernel/time.c b/kernel/time.c
index d3617dbd3dca..7c7964c33ae7 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -11,7 +11,7 @@
11 * Modification history kernel/time.c 11 * Modification history kernel/time.c
12 * 12 *
13 * 1993-09-02 Philip Gladstone 13 * 1993-09-02 Philip Gladstone
14 * Created file with time related functions from sched.c and adjtimex() 14 * Created file with time related functions from sched/core.c and adjtimex()
15 * 1993-10-08 Torsten Duwe 15 * 1993-10-08 Torsten Duwe
16 * adjtime interface update and CMOS clock write code 16 * adjtime interface update and CMOS clock write code
17 * 1995-08-13 Torsten Duwe 17 * 1995-08-13 Torsten Duwe
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 0c739423b0f9..20d6fba70652 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -599,8 +599,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
599 } else { 599 } else {
600 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { 600 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
601 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 601 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
602 if (dev->next_event.tv64 == KTIME_MAX)
603 goto out;
604 /* 602 /*
605 * The cpu which was handling the broadcast 603 * The cpu which was handling the broadcast
606 * timer marked this cpu in the broadcast 604 * timer marked this cpu in the broadcast
@@ -615,6 +613,11 @@ void tick_broadcast_oneshot_control(unsigned long reason)
615 goto out; 613 goto out;
616 614
617 /* 615 /*
616 * Bail out if there is no next event.
617 */
618 if (dev->next_event.tv64 == KTIME_MAX)
619 goto out;
620 /*
618 * If the pending bit is not set, then we are 621 * If the pending bit is not set, then we are
619 * either the CPU handling the broadcast 622 * either the CPU handling the broadcast
620 * interrupt or we got woken by something else. 623 * interrupt or we got woken by something else.
@@ -698,10 +701,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
698 701
699 bc->event_handler = tick_handle_oneshot_broadcast; 702 bc->event_handler = tick_handle_oneshot_broadcast;
700 703
701 /* Take the do_timer update */
702 if (!tick_nohz_full_cpu(cpu))
703 tick_do_timer_cpu = cpu;
704
705 /* 704 /*
706 * We must be careful here. There might be other CPUs 705 * We must be careful here. There might be other CPUs
707 * waiting for periodic broadcast. We need to set the 706 * waiting for periodic broadcast. We need to set the
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4208138fbf4..0cf1c1453181 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -306,7 +306,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
306 * we can't safely shutdown that CPU. 306 * we can't safely shutdown that CPU.
307 */ 307 */
308 if (have_nohz_full_mask && tick_do_timer_cpu == cpu) 308 if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
309 return -EINVAL; 309 return NOTIFY_BAD;
310 break; 310 break;
311 } 311 }
312 return NOTIFY_OK; 312 return NOTIFY_OK;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1a41023a1f88..e71a8be4a6ee 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -652,8 +652,6 @@ static struct {
652 ARCH_TRACE_CLOCKS 652 ARCH_TRACE_CLOCKS
653}; 653};
654 654
655int trace_clock_id;
656
657/* 655/*
658 * trace_parser_get_init - gets the buffer for trace parser 656 * trace_parser_get_init - gets the buffer for trace parser
659 */ 657 */
@@ -2826,7 +2824,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2826 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2824 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2827 2825
2828 /* Output in nanoseconds only if we are using a clock in nanoseconds. */ 2826 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
2829 if (trace_clocks[trace_clock_id].in_ns) 2827 if (trace_clocks[tr->clock_id].in_ns)
2830 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 2828 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
2831 2829
2832 /* stop the trace while dumping if we are not opening "snapshot" */ 2830 /* stop the trace while dumping if we are not opening "snapshot" */
@@ -3825,7 +3823,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3825 iter->iter_flags |= TRACE_FILE_LAT_FMT; 3823 iter->iter_flags |= TRACE_FILE_LAT_FMT;
3826 3824
3827 /* Output in nanoseconds only if we are using a clock in nanoseconds. */ 3825 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
3828 if (trace_clocks[trace_clock_id].in_ns) 3826 if (trace_clocks[tr->clock_id].in_ns)
3829 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 3827 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3830 3828
3831 iter->cpu_file = tc->cpu; 3829 iter->cpu_file = tc->cpu;
@@ -5095,7 +5093,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
5095 cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); 5093 cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);
5096 trace_seq_printf(s, "bytes: %ld\n", cnt); 5094 trace_seq_printf(s, "bytes: %ld\n", cnt);
5097 5095
5098 if (trace_clocks[trace_clock_id].in_ns) { 5096 if (trace_clocks[tr->clock_id].in_ns) {
5099 /* local or global for trace_clock */ 5097 /* local or global for trace_clock */
5100 t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); 5098 t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
5101 usec_rem = do_div(t, USEC_PER_SEC); 5099 usec_rem = do_div(t, USEC_PER_SEC);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 711ca7d3e7f1..20572ed88c5c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -700,8 +700,6 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
700 700
701extern unsigned long trace_flags; 701extern unsigned long trace_flags;
702 702
703extern int trace_clock_id;
704
705/* Standard output formatting function used for function return traces */ 703/* Standard output formatting function used for function return traces */
706#ifdef CONFIG_FUNCTION_GRAPH_TRACER 704#ifdef CONFIG_FUNCTION_GRAPH_TRACER
707 705
diff --git a/kernel/wait.c b/kernel/wait.c
index 6698e0c04ead..ce0daa320a26 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -287,3 +287,91 @@ wait_queue_head_t *bit_waitqueue(void *word, int bit)
287 return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; 287 return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
288} 288}
289EXPORT_SYMBOL(bit_waitqueue); 289EXPORT_SYMBOL(bit_waitqueue);
290
291/*
292 * Manipulate the atomic_t address to produce a better bit waitqueue table hash
293 * index (we're keying off bit -1, but that would produce a horrible hash
294 * value).
295 */
296static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
297{
298 if (BITS_PER_LONG == 64) {
299 unsigned long q = (unsigned long)p;
300 return bit_waitqueue((void *)(q & ~1), q & 1);
301 }
302 return bit_waitqueue(p, 0);
303}
304
305static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync,
306 void *arg)
307{
308 struct wait_bit_key *key = arg;
309 struct wait_bit_queue *wait_bit
310 = container_of(wait, struct wait_bit_queue, wait);
311 atomic_t *val = key->flags;
312
313 if (wait_bit->key.flags != key->flags ||
314 wait_bit->key.bit_nr != key->bit_nr ||
315 atomic_read(val) != 0)
316 return 0;
317 return autoremove_wake_function(wait, mode, sync, key);
318}
319
320/*
321 * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
322 * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero
323 * return codes halt waiting and return.
324 */
325static __sched
326int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
327 int (*action)(atomic_t *), unsigned mode)
328{
329 atomic_t *val;
330 int ret = 0;
331
332 do {
333 prepare_to_wait(wq, &q->wait, mode);
334 val = q->key.flags;
335 if (atomic_read(val) == 0)
336 ret = (*action)(val);
337 } while (!ret && atomic_read(val) != 0);
338 finish_wait(wq, &q->wait);
339 return ret;
340}
341
342#define DEFINE_WAIT_ATOMIC_T(name, p) \
343 struct wait_bit_queue name = { \
344 .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
345 .wait = { \
346 .private = current, \
347 .func = wake_atomic_t_function, \
348 .task_list = \
349 LIST_HEAD_INIT((name).wait.task_list), \
350 }, \
351 }
352
353__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
354 unsigned mode)
355{
356 wait_queue_head_t *wq = atomic_t_waitqueue(p);
357 DEFINE_WAIT_ATOMIC_T(wait, p);
358
359 return __wait_on_atomic_t(wq, &wait, action, mode);
360}
361EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
362
363/**
364 * wake_up_atomic_t - Wake up a waiter on an atomic_t
365 * @word: The word being waited on, a kernel virtual address
366 * @bit: The bit of the word being waited on
367 *
368 * Wake up anyone waiting for the atomic_t to go to zero.
369 *
370 * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
371 * check is done by the waiter's wake function, not by the waker itself).
372 */
373void wake_up_atomic_t(atomic_t *p)
374{
375 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
376}
377EXPORT_SYMBOL(wake_up_atomic_t);
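wait.c gains out_of_line_wait_on_atomic_t() and wake_up_atomic_t(), which reuse the bit-waitqueue hash so sleepers can wait for an atomic_t to drop to zero, with the zero check done on the waiter side after every wakeup. The same pattern can be sketched in userspace with C11 atomics and a condition variable; the helper names below (put_ref, wait_for_zero) are illustrative, not the kernel API.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int refs = 3;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

static void put_ref(void)          /* waker side, like wake_up_atomic_t() */
{
    if (atomic_fetch_sub(&refs, 1) == 1) {
        pthread_mutex_lock(&lock);
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
    }
}

static void wait_for_zero(void)    /* waiter side, like wait_on_atomic_t() */
{
    pthread_mutex_lock(&lock);
    while (atomic_load(&refs) != 0)   /* re-check the count after every wakeup */
        pthread_cond_wait(&cond, &lock);
    pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
    (void)arg;
    usleep(1000);   /* pretend to do some work, then drop the reference */
    put_ref();
    return NULL;
}

int main(void)
{
    pthread_t t[3];

    for (int i = 0; i < 3; i++)
        pthread_create(&t[i], NULL, worker, NULL);
    wait_for_zero();
    printf("all references dropped\n");
    for (int i = 0; i < 3; i++)
        pthread_join(t[i], NULL);
    return 0;
}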
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee8e29a2320c..f02c4a4a0c3c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -272,6 +272,15 @@ static cpumask_var_t *wq_numa_possible_cpumask;
272static bool wq_disable_numa; 272static bool wq_disable_numa;
273module_param_named(disable_numa, wq_disable_numa, bool, 0444); 273module_param_named(disable_numa, wq_disable_numa, bool, 0444);
274 274
275/* see the comment above the definition of WQ_POWER_EFFICIENT */
276#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
277static bool wq_power_efficient = true;
278#else
279static bool wq_power_efficient;
280#endif
281
282module_param_named(power_efficient, wq_power_efficient, bool, 0444);
283
275static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ 284static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
276 285
277/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ 286/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
@@ -305,6 +314,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly;
305EXPORT_SYMBOL_GPL(system_unbound_wq); 314EXPORT_SYMBOL_GPL(system_unbound_wq);
306struct workqueue_struct *system_freezable_wq __read_mostly; 315struct workqueue_struct *system_freezable_wq __read_mostly;
307EXPORT_SYMBOL_GPL(system_freezable_wq); 316EXPORT_SYMBOL_GPL(system_freezable_wq);
317struct workqueue_struct *system_power_efficient_wq __read_mostly;
318EXPORT_SYMBOL_GPL(system_power_efficient_wq);
319struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
320EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
308 321
309static int worker_thread(void *__worker); 322static int worker_thread(void *__worker);
310static void copy_workqueue_attrs(struct workqueue_attrs *to, 323static void copy_workqueue_attrs(struct workqueue_attrs *to,
@@ -4086,6 +4099,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
4086 struct workqueue_struct *wq; 4099 struct workqueue_struct *wq;
4087 struct pool_workqueue *pwq; 4100 struct pool_workqueue *pwq;
4088 4101
4102 /* see the comment above the definition of WQ_POWER_EFFICIENT */
4103 if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
4104 flags |= WQ_UNBOUND;
4105
4089 /* allocate wq and format name */ 4106 /* allocate wq and format name */
4090 if (flags & WQ_UNBOUND) 4107 if (flags & WQ_UNBOUND)
4091 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); 4108 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
@@ -4985,8 +5002,15 @@ static int __init init_workqueues(void)
4985 WQ_UNBOUND_MAX_ACTIVE); 5002 WQ_UNBOUND_MAX_ACTIVE);
4986 system_freezable_wq = alloc_workqueue("events_freezable", 5003 system_freezable_wq = alloc_workqueue("events_freezable",
4987 WQ_FREEZABLE, 0); 5004 WQ_FREEZABLE, 0);
5005 system_power_efficient_wq = alloc_workqueue("events_power_efficient",
5006 WQ_POWER_EFFICIENT, 0);
5007 system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
5008 WQ_FREEZABLE | WQ_POWER_EFFICIENT,
5009 0);
4988 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || 5010 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
4989 !system_unbound_wq || !system_freezable_wq); 5011 !system_unbound_wq || !system_freezable_wq ||
5012 !system_power_efficient_wq ||
5013 !system_freezable_power_efficient_wq);
4990 return 0; 5014 return 0;
4991} 5015}
4992early_initcall(init_workqueues); 5016early_initcall(init_workqueues);
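workqueue.c adds a power_efficient module parameter plus two new system workqueues, and __alloc_workqueue_key() silently promotes WQ_POWER_EFFICIENT queues to WQ_UNBOUND when that switch is on, trading cache locality for letting otherwise idle CPUs stay asleep. A toy sketch of that flag promotion, using made-up flag values rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define FAKE_WQ_UNBOUND         0x1
#define FAKE_WQ_POWER_EFFICIENT 0x2

static bool power_efficient_mode = true;   /* would come from the module param */

static unsigned int adjust_wq_flags(unsigned int flags)
{
    /* power-efficient queues become unbound when the global switch is on */
    if ((flags & FAKE_WQ_POWER_EFFICIENT) && power_efficient_mode)
        flags |= FAKE_WQ_UNBOUND;
    return flags;
}

int main(void)
{
    unsigned int flags = adjust_wq_flags(FAKE_WQ_POWER_EFFICIENT);

    printf("unbound: %s\n", (flags & FAKE_WQ_UNBOUND) ? "yes" : "no");
    return 0;
}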
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index ad83c96b2ece..7e2204db0b1a 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -64,7 +64,7 @@ static inline struct worker *current_wq_worker(void)
64 64
65/* 65/*
66 * Scheduler hooks for concurrency managed workqueue. Only to be used from 66 * Scheduler hooks for concurrency managed workqueue. Only to be used from
67 * sched.c and workqueue.c. 67 * sched/core.c and workqueue.c.
68 */ 68 */
69void wq_worker_waking_up(struct task_struct *task, int cpu); 69void wq_worker_waking_up(struct task_struct *task, int cpu);
70struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); 70struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);