path: root/kernel
author     Linus Torvalds <torvalds@linux-foundation.org>   2011-10-26 11:08:43 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2011-10-26 11:08:43 -0400
commit     8a4a8918ed6e4a361f4df19f199bbc2d0a89a46c (patch)
tree       d76974986aaaa8549baf2d6a106fa6cb60d64b88 /kernel
parent     8686a0e200419322654a75155e2e6f80346a1297 (diff)
parent     540f41edc15473ca3b2876de72646546ae101374 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (46 commits)
  llist: Add back llist_add_batch() and llist_del_first() prototypes
  sched: Don't use tasklist_lock for debug prints
  sched: Warn on rt throttling
  sched: Unify the ->cpus_allowed mask copy
  sched: Wrap scheduler p->cpus_allowed access
  sched: Request for idle balance during nohz idle load balance
  sched: Use resched IPI to kick off the nohz idle balance
  sched: Fix idle_cpu()
  llist: Remove cpu_relax() usage in cmpxchg loops
  sched: Convert to struct llist
  llist: Add llist_next()
  irq_work: Use llist in the struct irq_work logic
  llist: Return whether list is empty before adding in llist_add()
  llist: Move cpu_relax() to after the cmpxchg()
  llist: Remove the platform-dependent NMI checks
  llist: Make some llist functions inline
  sched, tracing: Show PREEMPT_ACTIVE state in trace_sched_switch
  sched: Remove redundant test in check_preempt_tick()
  sched: Add documentation for bandwidth control
  sched: Return unused runtime on group dequeue
  ...
Diffstat (limited to 'kernel')
-rw-r--r--   kernel/irq_work.c          91
-rw-r--r--   kernel/sched.c            666
-rw-r--r--   kernel/sched_cpupri.c      89
-rw-r--r--   kernel/sched_cpupri.h       7
-rw-r--r--   kernel/sched_fair.c       761
-rw-r--r--   kernel/sched_features.h     5
-rw-r--r--   kernel/sched_rt.c          99
-rw-r--r--   kernel/sched_stoptask.c     2
-rw-r--r--   kernel/sysctl.c            10
9 files changed, 1404 insertions(+), 326 deletions(-)
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c58fa7da8aef..0e2cde4f380b 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -17,54 +17,34 @@
17 * claimed NULL, 3 -> {pending} : claimed to be enqueued 17 * claimed NULL, 3 -> {pending} : claimed to be enqueued
18 * pending next, 3 -> {busy} : queued, pending callback 18 * pending next, 3 -> {busy} : queued, pending callback
19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed 19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
23 */ 20 */
24 21
25#define IRQ_WORK_PENDING 1UL 22#define IRQ_WORK_PENDING 1UL
26#define IRQ_WORK_BUSY 2UL 23#define IRQ_WORK_BUSY 2UL
27#define IRQ_WORK_FLAGS 3UL 24#define IRQ_WORK_FLAGS 3UL
28 25
29static inline bool irq_work_is_set(struct irq_work *entry, int flags) 26static DEFINE_PER_CPU(struct llist_head, irq_work_list);
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
49 27
50/* 28/*
51 * Claim the entry so that no one else will poke at it. 29 * Claim the entry so that no one else will poke at it.
52 */ 30 */
53static bool irq_work_claim(struct irq_work *entry) 31static bool irq_work_claim(struct irq_work *work)
54{ 32{
55 struct irq_work *next, *nflags; 33 unsigned long flags, nflags;
56 34
57 do { 35 for (;;) {
58 next = entry->next; 36 flags = work->flags;
59 if ((unsigned long)next & IRQ_WORK_PENDING) 37 if (flags & IRQ_WORK_PENDING)
60 return false; 38 return false;
61 nflags = next_flags(next, IRQ_WORK_FLAGS); 39 nflags = flags | IRQ_WORK_FLAGS;
62 } while (cmpxchg(&entry->next, next, nflags) != next); 40 if (cmpxchg(&work->flags, flags, nflags) == flags)
41 break;
42 cpu_relax();
43 }
63 44
64 return true; 45 return true;
65} 46}
66 47
67
68void __weak arch_irq_work_raise(void) 48void __weak arch_irq_work_raise(void)
69{ 49{
70 /* 50 /*
@@ -75,20 +55,15 @@ void __weak arch_irq_work_raise(void)
75/* 55/*
76 * Queue the entry and raise the IPI if needed. 56 * Queue the entry and raise the IPI if needed.
77 */ 57 */
78static void __irq_work_queue(struct irq_work *entry) 58static void __irq_work_queue(struct irq_work *work)
79{ 59{
80 struct irq_work *next; 60 bool empty;
81 61
82 preempt_disable(); 62 preempt_disable();
83 63
84 do { 64 empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89
90 /* The list was empty, raise self-interrupt to start processing. */ 65 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry)) 66 if (empty)
92 arch_irq_work_raise(); 67 arch_irq_work_raise();
93 68
94 preempt_enable(); 69 preempt_enable();
@@ -100,16 +75,16 @@ static void __irq_work_queue(struct irq_work *entry)
100 * 75 *
101 * Can be re-enqueued while the callback is still in progress. 76 * Can be re-enqueued while the callback is still in progress.
102 */ 77 */
103bool irq_work_queue(struct irq_work *entry) 78bool irq_work_queue(struct irq_work *work)
104{ 79{
105 if (!irq_work_claim(entry)) { 80 if (!irq_work_claim(work)) {
106 /* 81 /*
107 * Already enqueued, can't do! 82 * Already enqueued, can't do!
108 */ 83 */
109 return false; 84 return false;
110 } 85 }
111 86
112 __irq_work_queue(entry); 87 __irq_work_queue(work);
113 return true; 88 return true;
114} 89}
115EXPORT_SYMBOL_GPL(irq_work_queue); 90EXPORT_SYMBOL_GPL(irq_work_queue);
@@ -120,34 +95,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
120 */ 95 */
121void irq_work_run(void) 96void irq_work_run(void)
122{ 97{
123 struct irq_work *list; 98 struct irq_work *work;
99 struct llist_head *this_list;
100 struct llist_node *llnode;
124 101
125 if (this_cpu_read(irq_work_list) == NULL) 102 this_list = &__get_cpu_var(irq_work_list);
103 if (llist_empty(this_list))
126 return; 104 return;
127 105
128 BUG_ON(!in_irq()); 106 BUG_ON(!in_irq());
129 BUG_ON(!irqs_disabled()); 107 BUG_ON(!irqs_disabled());
130 108
131 list = this_cpu_xchg(irq_work_list, NULL); 109 llnode = llist_del_all(this_list);
132 110 while (llnode != NULL) {
133 while (list != NULL) { 111 work = llist_entry(llnode, struct irq_work, llnode);
134 struct irq_work *entry = list;
135 112
136 list = irq_work_next(list); 113 llnode = llist_next(llnode);
137 114
138 /* 115 /*
139 * Clear the PENDING bit, after this point the @entry 116 * Clear the PENDING bit, after this point the @work
140 * can be re-used. 117 * can be re-used.
141 */ 118 */
142 entry->next = next_flags(NULL, IRQ_WORK_BUSY); 119 work->flags = IRQ_WORK_BUSY;
143 entry->func(entry); 120 work->func(work);
144 /* 121 /*
145 * Clear the BUSY bit and return to the free state if 122 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile. 123 * no-one else claimed it meanwhile.
147 */ 124 */
148 (void)cmpxchg(&entry->next, 125 (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
151 } 126 }
152} 127}
153EXPORT_SYMBOL_GPL(irq_work_run); 128EXPORT_SYMBOL_GPL(irq_work_run);
@@ -156,11 +131,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
156 * Synchronize against the irq_work @entry, ensures the entry is not 131 * Synchronize against the irq_work @entry, ensures the entry is not
157 * currently in use. 132 * currently in use.
158 */ 133 */
159void irq_work_sync(struct irq_work *entry) 134void irq_work_sync(struct irq_work *work)
160{ 135{
161 WARN_ON_ONCE(irqs_disabled()); 136 WARN_ON_ONCE(irqs_disabled());
162 137
163 while (irq_work_is_set(entry, IRQ_WORK_BUSY)) 138 while (work->flags & IRQ_WORK_BUSY)
164 cpu_relax(); 139 cpu_relax();
165} 140}
166EXPORT_SYMBOL_GPL(irq_work_sync); 141EXPORT_SYMBOL_GPL(irq_work_sync);
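
The irq_work.c conversion above replaces the open-coded cmpxchg list with the
generic llist API; the property it leans on is that llist_add() reports whether
the list was empty before the push, which is what __irq_work_queue() uses to
decide whether to raise the self-IPI. Below is a minimal user-space sketch of
that "push and report emptiness" pattern, written with C11 atomics rather than
the kernel's llist implementation (push_report_empty, node and list_head are
illustrative names, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;
};

struct list_head {
	_Atomic(struct node *) first;
};

/* Push a node and return true if the list was empty beforehand,
 * mirroring the semantics this series gives llist_add(). */
static bool push_report_empty(struct node *n, struct list_head *h)
{
	struct node *first = atomic_load(&h->first);

	do {
		n->next = first;
	} while (!atomic_compare_exchange_weak(&h->first, &first, n));

	return first == NULL;	/* was empty: caller raises the IPI */
}

int main(void)
{
	struct list_head head = { .first = NULL };
	struct node a, b;

	printf("%d\n", push_report_empty(&a, &head));	/* 1: raise IPI */
	printf("%d\n", push_report_empty(&b, &head));	/* 0: already pending */
	return 0;
}

The same trick shows up again later in this merge: ttwu_queue_remote() only
sends the reschedule IPI when llist_add() reports that rq->wake_list was empty.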
diff --git a/kernel/sched.c b/kernel/sched.c
index 03ad0113801a..d87c6e5d4e8c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void)
196 return sysctl_sched_rt_runtime >= 0; 196 return sysctl_sched_rt_runtime >= 0;
197} 197}
198 198
199static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 199static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
200{ 200{
201 ktime_t now; 201 unsigned long delta;
202 ktime_t soft, hard, now;
203
204 for (;;) {
205 if (hrtimer_active(period_timer))
206 break;
207
208 now = hrtimer_cb_get_time(period_timer);
209 hrtimer_forward(period_timer, now, period);
202 210
211 soft = hrtimer_get_softexpires(period_timer);
212 hard = hrtimer_get_expires(period_timer);
213 delta = ktime_to_ns(ktime_sub(hard, soft));
214 __hrtimer_start_range_ns(period_timer, soft, delta,
215 HRTIMER_MODE_ABS_PINNED, 0);
216 }
217}
218
219static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
220{
203 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 221 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
204 return; 222 return;
205 223
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
207 return; 225 return;
208 226
209 raw_spin_lock(&rt_b->rt_runtime_lock); 227 raw_spin_lock(&rt_b->rt_runtime_lock);
210 for (;;) { 228 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
211 unsigned long delta;
212 ktime_t soft, hard;
213
214 if (hrtimer_active(&rt_b->rt_period_timer))
215 break;
216
217 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
218 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
219
220 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
221 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
222 delta = ktime_to_ns(ktime_sub(hard, soft));
223 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
224 HRTIMER_MODE_ABS_PINNED, 0);
225 }
226 raw_spin_unlock(&rt_b->rt_runtime_lock); 229 raw_spin_unlock(&rt_b->rt_runtime_lock);
227} 230}
228 231
@@ -247,6 +250,24 @@ struct cfs_rq;
247 250
248static LIST_HEAD(task_groups); 251static LIST_HEAD(task_groups);
249 252
253struct cfs_bandwidth {
254#ifdef CONFIG_CFS_BANDWIDTH
255 raw_spinlock_t lock;
256 ktime_t period;
257 u64 quota, runtime;
258 s64 hierarchal_quota;
259 u64 runtime_expires;
260
261 int idle, timer_active;
262 struct hrtimer period_timer, slack_timer;
263 struct list_head throttled_cfs_rq;
264
265 /* statistics */
266 int nr_periods, nr_throttled;
267 u64 throttled_time;
268#endif
269};
270
250/* task group related information */ 271/* task group related information */
251struct task_group { 272struct task_group {
252 struct cgroup_subsys_state css; 273 struct cgroup_subsys_state css;
@@ -278,6 +299,8 @@ struct task_group {
278#ifdef CONFIG_SCHED_AUTOGROUP 299#ifdef CONFIG_SCHED_AUTOGROUP
279 struct autogroup *autogroup; 300 struct autogroup *autogroup;
280#endif 301#endif
302
303 struct cfs_bandwidth cfs_bandwidth;
281}; 304};
282 305
283/* task_group_lock serializes the addition/removal of task groups */ 306/* task_group_lock serializes the addition/removal of task groups */
@@ -311,7 +334,7 @@ struct task_group root_task_group;
311/* CFS-related fields in a runqueue */ 334/* CFS-related fields in a runqueue */
312struct cfs_rq { 335struct cfs_rq {
313 struct load_weight load; 336 struct load_weight load;
314 unsigned long nr_running; 337 unsigned long nr_running, h_nr_running;
315 338
316 u64 exec_clock; 339 u64 exec_clock;
317 u64 min_vruntime; 340 u64 min_vruntime;
@@ -377,9 +400,120 @@ struct cfs_rq {
377 400
378 unsigned long load_contribution; 401 unsigned long load_contribution;
379#endif 402#endif
403#ifdef CONFIG_CFS_BANDWIDTH
404 int runtime_enabled;
405 u64 runtime_expires;
406 s64 runtime_remaining;
407
408 u64 throttled_timestamp;
409 int throttled, throttle_count;
410 struct list_head throttled_list;
411#endif
380#endif 412#endif
381}; 413};
382 414
415#ifdef CONFIG_FAIR_GROUP_SCHED
416#ifdef CONFIG_CFS_BANDWIDTH
417static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
418{
419 return &tg->cfs_bandwidth;
420}
421
422static inline u64 default_cfs_period(void);
423static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
424static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
425
426static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
427{
428 struct cfs_bandwidth *cfs_b =
429 container_of(timer, struct cfs_bandwidth, slack_timer);
430 do_sched_cfs_slack_timer(cfs_b);
431
432 return HRTIMER_NORESTART;
433}
434
435static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
436{
437 struct cfs_bandwidth *cfs_b =
438 container_of(timer, struct cfs_bandwidth, period_timer);
439 ktime_t now;
440 int overrun;
441 int idle = 0;
442
443 for (;;) {
444 now = hrtimer_cb_get_time(timer);
445 overrun = hrtimer_forward(timer, now, cfs_b->period);
446
447 if (!overrun)
448 break;
449
450 idle = do_sched_cfs_period_timer(cfs_b, overrun);
451 }
452
453 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
454}
455
456static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
457{
458 raw_spin_lock_init(&cfs_b->lock);
459 cfs_b->runtime = 0;
460 cfs_b->quota = RUNTIME_INF;
461 cfs_b->period = ns_to_ktime(default_cfs_period());
462
463 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
464 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
465 cfs_b->period_timer.function = sched_cfs_period_timer;
466 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
467 cfs_b->slack_timer.function = sched_cfs_slack_timer;
468}
469
470static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
471{
472 cfs_rq->runtime_enabled = 0;
473 INIT_LIST_HEAD(&cfs_rq->throttled_list);
474}
475
476/* requires cfs_b->lock, may release to reprogram timer */
477static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
478{
479 /*
480 * The timer may be active because we're trying to set a new bandwidth
481 * period or because we're racing with the tear-down path
482 * (timer_active==0 becomes visible before the hrtimer call-back
483 * terminates). In either case we ensure that it's re-programmed
484 */
485 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
486 raw_spin_unlock(&cfs_b->lock);
487 /* ensure cfs_b->lock is available while we wait */
488 hrtimer_cancel(&cfs_b->period_timer);
489
490 raw_spin_lock(&cfs_b->lock);
491 /* if someone else restarted the timer then we're done */
492 if (cfs_b->timer_active)
493 return;
494 }
495
496 cfs_b->timer_active = 1;
497 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
498}
499
500static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
501{
502 hrtimer_cancel(&cfs_b->period_timer);
503 hrtimer_cancel(&cfs_b->slack_timer);
504}
505#else
506static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
507static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
508static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509
510static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
511{
512 return NULL;
513}
514#endif /* CONFIG_CFS_BANDWIDTH */
515#endif /* CONFIG_FAIR_GROUP_SCHED */
516
383/* Real-Time classes' related field in a runqueue: */ 517/* Real-Time classes' related field in a runqueue: */
384struct rt_rq { 518struct rt_rq {
385 struct rt_prio_array active; 519 struct rt_prio_array active;
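
sched_cfs_period_timer() above follows the usual hrtimer pattern: forward the
timer by whole periods and, for every non-zero overrun, replenish the group's
quota via do_sched_cfs_period_timer(). A rough user-space analogue of that
overrun loop, with plain nanosecond counters standing in for ktime_t and
hrtimer_forward() (default_cfs_period() is 100ms as in the patch; everything
else here is illustrative, not the hrtimer API):

#include <stdint.h>
#include <stdio.h>

/* Advance the expiry by whole periods and report how many periods were
 * skipped -- a stand-in for the hrtimer_forward() overrun count. */
static int forward_timer(uint64_t *expires, uint64_t now, uint64_t period)
{
	int overrun = 0;

	while (*expires <= now) {
		*expires += period;
		overrun++;
	}
	return overrun;
}

int main(void)
{
	uint64_t period = 100000000ULL;	/* default_cfs_period(): 100ms in ns */
	uint64_t expires = 100000000ULL;
	uint64_t now = 350000000ULL;	/* the timer is several periods overdue */
	int overrun;

	/* each non-zero overrun maps to one do_sched_cfs_period_timer() call */
	while ((overrun = forward_timer(&expires, now, period)))
		printf("replenish quota, %d period(s) elapsed\n", overrun);

	return 0;
}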
@@ -510,7 +644,7 @@ struct rq {
510 644
511 unsigned long cpu_power; 645 unsigned long cpu_power;
512 646
513 unsigned char idle_at_tick; 647 unsigned char idle_balance;
514 /* For active balancing */ 648 /* For active balancing */
515 int post_schedule; 649 int post_schedule;
516 int active_balance; 650 int active_balance;
@@ -520,8 +654,6 @@ struct rq {
520 int cpu; 654 int cpu;
521 int online; 655 int online;
522 656
523 unsigned long avg_load_per_task;
524
525 u64 rt_avg; 657 u64 rt_avg;
526 u64 age_stamp; 658 u64 age_stamp;
527 u64 idle_stamp; 659 u64 idle_stamp;
@@ -570,7 +702,7 @@ struct rq {
570#endif 702#endif
571 703
572#ifdef CONFIG_SMP 704#ifdef CONFIG_SMP
573 struct task_struct *wake_list; 705 struct llist_head wake_list;
574#endif 706#endif
575}; 707};
576 708
@@ -1272,6 +1404,18 @@ void wake_up_idle_cpu(int cpu)
1272 smp_send_reschedule(cpu); 1404 smp_send_reschedule(cpu);
1273} 1405}
1274 1406
1407static inline bool got_nohz_idle_kick(void)
1408{
1409 return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
1410}
1411
1412#else /* CONFIG_NO_HZ */
1413
1414static inline bool got_nohz_idle_kick(void)
1415{
1416 return false;
1417}
1418
1275#endif /* CONFIG_NO_HZ */ 1419#endif /* CONFIG_NO_HZ */
1276 1420
1277static u64 sched_avg_period(void) 1421static u64 sched_avg_period(void)
@@ -1471,24 +1615,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1471 update_load_sub(&rq->load, load); 1615 update_load_sub(&rq->load, load);
1472} 1616}
1473 1617
1474#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1618#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1619 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1475typedef int (*tg_visitor)(struct task_group *, void *); 1620typedef int (*tg_visitor)(struct task_group *, void *);
1476 1621
1477/* 1622/*
1478 * Iterate the full tree, calling @down when first entering a node and @up when 1623 * Iterate task_group tree rooted at *from, calling @down when first entering a
1479 * leaving it for the final time. 1624 * node and @up when leaving it for the final time.
1625 *
1626 * Caller must hold rcu_lock or sufficient equivalent.
1480 */ 1627 */
1481static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1628static int walk_tg_tree_from(struct task_group *from,
1629 tg_visitor down, tg_visitor up, void *data)
1482{ 1630{
1483 struct task_group *parent, *child; 1631 struct task_group *parent, *child;
1484 int ret; 1632 int ret;
1485 1633
1486 rcu_read_lock(); 1634 parent = from;
1487 parent = &root_task_group; 1635
1488down: 1636down:
1489 ret = (*down)(parent, data); 1637 ret = (*down)(parent, data);
1490 if (ret) 1638 if (ret)
1491 goto out_unlock; 1639 goto out;
1492 list_for_each_entry_rcu(child, &parent->children, siblings) { 1640 list_for_each_entry_rcu(child, &parent->children, siblings) {
1493 parent = child; 1641 parent = child;
1494 goto down; 1642 goto down;
@@ -1497,19 +1645,29 @@ up:
1497 continue; 1645 continue;
1498 } 1646 }
1499 ret = (*up)(parent, data); 1647 ret = (*up)(parent, data);
1500 if (ret) 1648 if (ret || parent == from)
1501 goto out_unlock; 1649 goto out;
1502 1650
1503 child = parent; 1651 child = parent;
1504 parent = parent->parent; 1652 parent = parent->parent;
1505 if (parent) 1653 if (parent)
1506 goto up; 1654 goto up;
1507out_unlock: 1655out:
1508 rcu_read_unlock();
1509
1510 return ret; 1656 return ret;
1511} 1657}
1512 1658
1659/*
1660 * Iterate the full tree, calling @down when first entering a node and @up when
1661 * leaving it for the final time.
1662 *
1663 * Caller must hold rcu_lock or sufficient equivalent.
1664 */
1665
1666static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1667{
1668 return walk_tg_tree_from(&root_task_group, down, up, data);
1669}
1670
1513static int tg_nop(struct task_group *tg, void *data) 1671static int tg_nop(struct task_group *tg, void *data)
1514{ 1672{
1515 return 0; 1673 return 0;
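
walk_tg_tree_from() above is a pre/post-order walk over the task_group tree:
@down runs when a node is first entered, @up when it is left for the final
time, a non-zero return aborts the walk, and RCU locking is now the caller's
responsibility. A small user-space sketch of the same down/up visitor idea
over a generic tree (recursive for brevity, where the kernel version is
iterative; all names here are made up):

#include <stdio.h>

struct node {
	const char *name;
	struct node *child[4];	/* small fixed fan-out for the sketch */
	int nr_children;
};

typedef int (*visitor)(struct node *, void *);

/* Call @down when first entering a node, @up when leaving it for the
 * final time; a non-zero return aborts the walk, as in walk_tg_tree_from(). */
static int walk_from(struct node *from, visitor down, visitor up, void *data)
{
	int i, ret;

	ret = down(from, data);
	if (ret)
		return ret;

	for (i = 0; i < from->nr_children; i++) {
		ret = walk_from(from->child[i], down, up, data);
		if (ret)
			return ret;
	}

	return up(from, data);
}

static int print_down(struct node *n, void *data) { printf("down %s\n", n->name); return 0; }
static int print_up(struct node *n, void *data)   { printf("up   %s\n", n->name); return 0; }

int main(void)
{
	struct node leaf = { .name = "child" };
	struct node root = { .name = "root", .child = { &leaf }, .nr_children = 1 };

	return walk_from(&root, print_down, print_up, NULL);
}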
@@ -1569,11 +1727,9 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1569 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1727 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1570 1728
1571 if (nr_running) 1729 if (nr_running)
1572 rq->avg_load_per_task = rq->load.weight / nr_running; 1730 return rq->load.weight / nr_running;
1573 else
1574 rq->avg_load_per_task = 0;
1575 1731
1576 return rq->avg_load_per_task; 1732 return 0;
1577} 1733}
1578 1734
1579#ifdef CONFIG_PREEMPT 1735#ifdef CONFIG_PREEMPT
@@ -1806,7 +1962,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1806 rq->nr_uninterruptible--; 1962 rq->nr_uninterruptible--;
1807 1963
1808 enqueue_task(rq, p, flags); 1964 enqueue_task(rq, p, flags);
1809 inc_nr_running(rq);
1810} 1965}
1811 1966
1812/* 1967/*
@@ -1818,7 +1973,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1818 rq->nr_uninterruptible++; 1973 rq->nr_uninterruptible++;
1819 1974
1820 dequeue_task(rq, p, flags); 1975 dequeue_task(rq, p, flags);
1821 dec_nr_running(rq);
1822} 1976}
1823 1977
1824#ifdef CONFIG_IRQ_TIME_ACCOUNTING 1978#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2390,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2390 2544
2391 /* Look for allowed, online CPU in same node. */ 2545 /* Look for allowed, online CPU in same node. */
2392 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 2546 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2393 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 2547 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
2394 return dest_cpu; 2548 return dest_cpu;
2395 2549
2396 /* Any allowed, online CPU? */ 2550 /* Any allowed, online CPU? */
2397 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 2551 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
2398 if (dest_cpu < nr_cpu_ids) 2552 if (dest_cpu < nr_cpu_ids)
2399 return dest_cpu; 2553 return dest_cpu;
2400 2554
@@ -2431,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2431 * [ this allows ->select_task() to simply return task_cpu(p) and 2585 * [ this allows ->select_task() to simply return task_cpu(p) and
2432 * not worry about this generic constraint ] 2586 * not worry about this generic constraint ]
2433 */ 2587 */
2434 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 2588 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
2435 !cpu_online(cpu))) 2589 !cpu_online(cpu)))
2436 cpu = select_fallback_rq(task_cpu(p), p); 2590 cpu = select_fallback_rq(task_cpu(p), p);
2437 2591
@@ -2556,42 +2710,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
2556} 2710}
2557 2711
2558#ifdef CONFIG_SMP 2712#ifdef CONFIG_SMP
2559static void sched_ttwu_do_pending(struct task_struct *list) 2713static void sched_ttwu_pending(void)
2560{ 2714{
2561 struct rq *rq = this_rq(); 2715 struct rq *rq = this_rq();
2716 struct llist_node *llist = llist_del_all(&rq->wake_list);
2717 struct task_struct *p;
2562 2718
2563 raw_spin_lock(&rq->lock); 2719 raw_spin_lock(&rq->lock);
2564 2720
2565 while (list) { 2721 while (llist) {
2566 struct task_struct *p = list; 2722 p = llist_entry(llist, struct task_struct, wake_entry);
2567 list = list->wake_entry; 2723 llist = llist_next(llist);
2568 ttwu_do_activate(rq, p, 0); 2724 ttwu_do_activate(rq, p, 0);
2569 } 2725 }
2570 2726
2571 raw_spin_unlock(&rq->lock); 2727 raw_spin_unlock(&rq->lock);
2572} 2728}
2573 2729
2574#ifdef CONFIG_HOTPLUG_CPU
2575
2576static void sched_ttwu_pending(void)
2577{
2578 struct rq *rq = this_rq();
2579 struct task_struct *list = xchg(&rq->wake_list, NULL);
2580
2581 if (!list)
2582 return;
2583
2584 sched_ttwu_do_pending(list);
2585}
2586
2587#endif /* CONFIG_HOTPLUG_CPU */
2588
2589void scheduler_ipi(void) 2730void scheduler_ipi(void)
2590{ 2731{
2591 struct rq *rq = this_rq(); 2732 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2592 struct task_struct *list = xchg(&rq->wake_list, NULL);
2593
2594 if (!list)
2595 return; 2733 return;
2596 2734
2597 /* 2735 /*
@@ -2608,25 +2746,21 @@ void scheduler_ipi(void)
2608 * somewhat pessimize the simple resched case. 2746 * somewhat pessimize the simple resched case.
2609 */ 2747 */
2610 irq_enter(); 2748 irq_enter();
2611 sched_ttwu_do_pending(list); 2749 sched_ttwu_pending();
2750
2751 /*
2752 * Check if someone kicked us for doing the nohz idle load balance.
2753 */
2754 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
2755 this_rq()->idle_balance = 1;
2756 raise_softirq_irqoff(SCHED_SOFTIRQ);
2757 }
2612 irq_exit(); 2758 irq_exit();
2613} 2759}
2614 2760
2615static void ttwu_queue_remote(struct task_struct *p, int cpu) 2761static void ttwu_queue_remote(struct task_struct *p, int cpu)
2616{ 2762{
2617 struct rq *rq = cpu_rq(cpu); 2763 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
2618 struct task_struct *next = rq->wake_list;
2619
2620 for (;;) {
2621 struct task_struct *old = next;
2622
2623 p->wake_entry = next;
2624 next = cmpxchg(&rq->wake_list, old, p);
2625 if (next == old)
2626 break;
2627 }
2628
2629 if (!next)
2630 smp_send_reschedule(cpu); 2764 smp_send_reschedule(cpu);
2631} 2765}
2632 2766
@@ -2848,19 +2982,23 @@ void sched_fork(struct task_struct *p)
2848 p->state = TASK_RUNNING; 2982 p->state = TASK_RUNNING;
2849 2983
2850 /* 2984 /*
2985 * Make sure we do not leak PI boosting priority to the child.
2986 */
2987 p->prio = current->normal_prio;
2988
2989 /*
2851 * Revert to default priority/policy on fork if requested. 2990 * Revert to default priority/policy on fork if requested.
2852 */ 2991 */
2853 if (unlikely(p->sched_reset_on_fork)) { 2992 if (unlikely(p->sched_reset_on_fork)) {
2854 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2993 if (task_has_rt_policy(p)) {
2855 p->policy = SCHED_NORMAL; 2994 p->policy = SCHED_NORMAL;
2856 p->normal_prio = p->static_prio;
2857 }
2858
2859 if (PRIO_TO_NICE(p->static_prio) < 0) {
2860 p->static_prio = NICE_TO_PRIO(0); 2995 p->static_prio = NICE_TO_PRIO(0);
2861 p->normal_prio = p->static_prio; 2996 p->rt_priority = 0;
2862 set_load_weight(p); 2997 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2863 } 2998 p->static_prio = NICE_TO_PRIO(0);
2999
3000 p->prio = p->normal_prio = __normal_prio(p);
3001 set_load_weight(p);
2864 3002
2865 /* 3003 /*
2866 * We don't need the reset flag anymore after the fork. It has 3004 * We don't need the reset flag anymore after the fork. It has
@@ -2869,11 +3007,6 @@ void sched_fork(struct task_struct *p)
2869 p->sched_reset_on_fork = 0; 3007 p->sched_reset_on_fork = 0;
2870 } 3008 }
2871 3009
2872 /*
2873 * Make sure we do not leak PI boosting priority to the child.
2874 */
2875 p->prio = current->normal_prio;
2876
2877 if (!rt_prio(p->prio)) 3010 if (!rt_prio(p->prio))
2878 p->sched_class = &fair_sched_class; 3011 p->sched_class = &fair_sched_class;
2879 3012
@@ -4116,7 +4249,7 @@ void scheduler_tick(void)
4116 perf_event_task_tick(); 4249 perf_event_task_tick();
4117 4250
4118#ifdef CONFIG_SMP 4251#ifdef CONFIG_SMP
4119 rq->idle_at_tick = idle_cpu(cpu); 4252 rq->idle_balance = idle_cpu(cpu);
4120 trigger_load_balance(rq, cpu); 4253 trigger_load_balance(rq, cpu);
4121#endif 4254#endif
4122} 4255}
@@ -4240,7 +4373,7 @@ pick_next_task(struct rq *rq)
4240 * Optimization: we know that if all tasks are in 4373 * Optimization: we know that if all tasks are in
4241 * the fair class we can call that function directly: 4374 * the fair class we can call that function directly:
4242 */ 4375 */
4243 if (likely(rq->nr_running == rq->cfs.nr_running)) { 4376 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
4244 p = fair_sched_class.pick_next_task(rq); 4377 p = fair_sched_class.pick_next_task(rq);
4245 if (likely(p)) 4378 if (likely(p))
4246 return p; 4379 return p;
@@ -5026,7 +5159,20 @@ EXPORT_SYMBOL(task_nice);
5026 */ 5159 */
5027int idle_cpu(int cpu) 5160int idle_cpu(int cpu)
5028{ 5161{
5029 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 5162 struct rq *rq = cpu_rq(cpu);
5163
5164 if (rq->curr != rq->idle)
5165 return 0;
5166
5167 if (rq->nr_running)
5168 return 0;
5169
5170#ifdef CONFIG_SMP
5171 if (!llist_empty(&rq->wake_list))
5172 return 0;
5173#endif
5174
5175 return 1;
5030} 5176}
5031 5177
5032/** 5178/**
@@ -5876,7 +6022,7 @@ void show_state_filter(unsigned long state_filter)
5876 printk(KERN_INFO 6022 printk(KERN_INFO
5877 " task PC stack pid father\n"); 6023 " task PC stack pid father\n");
5878#endif 6024#endif
5879 read_lock(&tasklist_lock); 6025 rcu_read_lock();
5880 do_each_thread(g, p) { 6026 do_each_thread(g, p) {
5881 /* 6027 /*
5882 * reset the NMI-timeout, listing all files on a slow 6028 * reset the NMI-timeout, listing all files on a slow
@@ -5892,7 +6038,7 @@ void show_state_filter(unsigned long state_filter)
5892#ifdef CONFIG_SCHED_DEBUG 6038#ifdef CONFIG_SCHED_DEBUG
5893 sysrq_sched_debug_show(); 6039 sysrq_sched_debug_show();
5894#endif 6040#endif
5895 read_unlock(&tasklist_lock); 6041 rcu_read_unlock();
5896 /* 6042 /*
5897 * Only show locks if all tasks are dumped: 6043 * Only show locks if all tasks are dumped:
5898 */ 6044 */
@@ -6007,10 +6153,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6007{ 6153{
6008 if (p->sched_class && p->sched_class->set_cpus_allowed) 6154 if (p->sched_class && p->sched_class->set_cpus_allowed)
6009 p->sched_class->set_cpus_allowed(p, new_mask); 6155 p->sched_class->set_cpus_allowed(p, new_mask);
6010 else { 6156
6011 cpumask_copy(&p->cpus_allowed, new_mask); 6157 cpumask_copy(&p->cpus_allowed, new_mask);
6012 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 6158 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6013 }
6014} 6159}
6015 6160
6016/* 6161/*
@@ -6108,7 +6253,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6108 if (task_cpu(p) != src_cpu) 6253 if (task_cpu(p) != src_cpu)
6109 goto done; 6254 goto done;
6110 /* Affinity changed (again). */ 6255 /* Affinity changed (again). */
6111 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 6256 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
6112 goto fail; 6257 goto fail;
6113 6258
6114 /* 6259 /*
@@ -6189,6 +6334,30 @@ static void calc_global_load_remove(struct rq *rq)
6189 rq->calc_load_active = 0; 6334 rq->calc_load_active = 0;
6190} 6335}
6191 6336
6337#ifdef CONFIG_CFS_BANDWIDTH
6338static void unthrottle_offline_cfs_rqs(struct rq *rq)
6339{
6340 struct cfs_rq *cfs_rq;
6341
6342 for_each_leaf_cfs_rq(rq, cfs_rq) {
6343 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6344
6345 if (!cfs_rq->runtime_enabled)
6346 continue;
6347
6348 /*
6349 * clock_task is not advancing so we just need to make sure
6350 * there's some valid quota amount
6351 */
6352 cfs_rq->runtime_remaining = cfs_b->quota;
6353 if (cfs_rq_throttled(cfs_rq))
6354 unthrottle_cfs_rq(cfs_rq);
6355 }
6356}
6357#else
6358static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6359#endif
6360
6192/* 6361/*
6193 * Migrate all tasks from the rq, sleeping tasks will be migrated by 6362 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6194 * try_to_wake_up()->select_task_rq(). 6363 * try_to_wake_up()->select_task_rq().
@@ -6214,6 +6383,9 @@ static void migrate_tasks(unsigned int dead_cpu)
6214 */ 6383 */
6215 rq->stop = NULL; 6384 rq->stop = NULL;
6216 6385
6386 /* Ensure any throttled groups are reachable by pick_next_task */
6387 unthrottle_offline_cfs_rqs(rq);
6388
6217 for ( ; ; ) { 6389 for ( ; ; ) {
6218 /* 6390 /*
6219 * There's this thread running, bail when that's the only 6391 * There's this thread running, bail when that's the only
@@ -7957,6 +8129,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7957 /* allow initial update_cfs_load() to truncate */ 8129 /* allow initial update_cfs_load() to truncate */
7958 cfs_rq->load_stamp = 1; 8130 cfs_rq->load_stamp = 1;
7959#endif 8131#endif
8132 init_cfs_rq_runtime(cfs_rq);
7960 8133
7961 tg->cfs_rq[cpu] = cfs_rq; 8134 tg->cfs_rq[cpu] = cfs_rq;
7962 tg->se[cpu] = se; 8135 tg->se[cpu] = se;
@@ -8096,6 +8269,7 @@ void __init sched_init(void)
8096 * We achieve this by letting root_task_group's tasks sit 8269 * We achieve this by letting root_task_group's tasks sit
8097 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 8270 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8098 */ 8271 */
8272 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
8099 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 8273 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8100#endif /* CONFIG_FAIR_GROUP_SCHED */ 8274#endif /* CONFIG_FAIR_GROUP_SCHED */
8101 8275
@@ -8125,7 +8299,6 @@ void __init sched_init(void)
8125 rq_attach_root(rq, &def_root_domain); 8299 rq_attach_root(rq, &def_root_domain);
8126#ifdef CONFIG_NO_HZ 8300#ifdef CONFIG_NO_HZ
8127 rq->nohz_balance_kick = 0; 8301 rq->nohz_balance_kick = 0;
8128 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
8129#endif 8302#endif
8130#endif 8303#endif
8131 init_rq_hrtick(rq); 8304 init_rq_hrtick(rq);
@@ -8336,6 +8509,8 @@ static void free_fair_sched_group(struct task_group *tg)
8336{ 8509{
8337 int i; 8510 int i;
8338 8511
8512 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8513
8339 for_each_possible_cpu(i) { 8514 for_each_possible_cpu(i) {
8340 if (tg->cfs_rq) 8515 if (tg->cfs_rq)
8341 kfree(tg->cfs_rq[i]); 8516 kfree(tg->cfs_rq[i]);
@@ -8363,6 +8538,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8363 8538
8364 tg->shares = NICE_0_LOAD; 8539 tg->shares = NICE_0_LOAD;
8365 8540
8541 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8542
8366 for_each_possible_cpu(i) { 8543 for_each_possible_cpu(i) {
8367 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8544 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8368 GFP_KERNEL, cpu_to_node(i)); 8545 GFP_KERNEL, cpu_to_node(i));
@@ -8638,12 +8815,7 @@ unsigned long sched_group_shares(struct task_group *tg)
8638} 8815}
8639#endif 8816#endif
8640 8817
8641#ifdef CONFIG_RT_GROUP_SCHED 8818#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
8642/*
8643 * Ensure that the real time constraints are schedulable.
8644 */
8645static DEFINE_MUTEX(rt_constraints_mutex);
8646
8647static unsigned long to_ratio(u64 period, u64 runtime) 8819static unsigned long to_ratio(u64 period, u64 runtime)
8648{ 8820{
8649 if (runtime == RUNTIME_INF) 8821 if (runtime == RUNTIME_INF)
@@ -8651,6 +8823,13 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8651 8823
8652 return div64_u64(runtime << 20, period); 8824 return div64_u64(runtime << 20, period);
8653} 8825}
8826#endif
8827
8828#ifdef CONFIG_RT_GROUP_SCHED
8829/*
8830 * Ensure that the real time constraints are schedulable.
8831 */
8832static DEFINE_MUTEX(rt_constraints_mutex);
8654 8833
8655/* Must be called with tasklist_lock held */ 8834/* Must be called with tasklist_lock held */
8656static inline int tg_has_rt_tasks(struct task_group *tg) 8835static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -8671,7 +8850,7 @@ struct rt_schedulable_data {
8671 u64 rt_runtime; 8850 u64 rt_runtime;
8672}; 8851};
8673 8852
8674static int tg_schedulable(struct task_group *tg, void *data) 8853static int tg_rt_schedulable(struct task_group *tg, void *data)
8675{ 8854{
8676 struct rt_schedulable_data *d = data; 8855 struct rt_schedulable_data *d = data;
8677 struct task_group *child; 8856 struct task_group *child;
@@ -8729,16 +8908,22 @@ static int tg_schedulable(struct task_group *tg, void *data)
8729 8908
8730static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8909static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8731{ 8910{
8911 int ret;
8912
8732 struct rt_schedulable_data data = { 8913 struct rt_schedulable_data data = {
8733 .tg = tg, 8914 .tg = tg,
8734 .rt_period = period, 8915 .rt_period = period,
8735 .rt_runtime = runtime, 8916 .rt_runtime = runtime,
8736 }; 8917 };
8737 8918
8738 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8919 rcu_read_lock();
8920 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
8921 rcu_read_unlock();
8922
8923 return ret;
8739} 8924}
8740 8925
8741static int tg_set_bandwidth(struct task_group *tg, 8926static int tg_set_rt_bandwidth(struct task_group *tg,
8742 u64 rt_period, u64 rt_runtime) 8927 u64 rt_period, u64 rt_runtime)
8743{ 8928{
8744 int i, err = 0; 8929 int i, err = 0;
@@ -8777,7 +8962,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8777 if (rt_runtime_us < 0) 8962 if (rt_runtime_us < 0)
8778 rt_runtime = RUNTIME_INF; 8963 rt_runtime = RUNTIME_INF;
8779 8964
8780 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8965 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8781} 8966}
8782 8967
8783long sched_group_rt_runtime(struct task_group *tg) 8968long sched_group_rt_runtime(struct task_group *tg)
@@ -8802,7 +8987,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8802 if (rt_period == 0) 8987 if (rt_period == 0)
8803 return -EINVAL; 8988 return -EINVAL;
8804 8989
8805 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8990 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8806} 8991}
8807 8992
8808long sched_group_rt_period(struct task_group *tg) 8993long sched_group_rt_period(struct task_group *tg)
@@ -8992,6 +9177,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8992 9177
8993 return (u64) scale_load_down(tg->shares); 9178 return (u64) scale_load_down(tg->shares);
8994} 9179}
9180
9181#ifdef CONFIG_CFS_BANDWIDTH
9182static DEFINE_MUTEX(cfs_constraints_mutex);
9183
9184const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
9185const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9186
9187static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9188
9189static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9190{
9191 int i, ret = 0, runtime_enabled;
9192 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9193
9194 if (tg == &root_task_group)
9195 return -EINVAL;
9196
9197 /*
9198 * Ensure we have at some amount of bandwidth every period. This is
9199 * to prevent reaching a state of large arrears when throttled via
9200 * entity_tick() resulting in prolonged exit starvation.
9201 */
9202 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
9203 return -EINVAL;
9204
9205 /*
9206 * Likewise, bound things on the otherside by preventing insane quota
9207 * periods. This also allows us to normalize in computing quota
9208 * feasibility.
9209 */
9210 if (period > max_cfs_quota_period)
9211 return -EINVAL;
9212
9213 mutex_lock(&cfs_constraints_mutex);
9214 ret = __cfs_schedulable(tg, period, quota);
9215 if (ret)
9216 goto out_unlock;
9217
9218 runtime_enabled = quota != RUNTIME_INF;
9219 raw_spin_lock_irq(&cfs_b->lock);
9220 cfs_b->period = ns_to_ktime(period);
9221 cfs_b->quota = quota;
9222
9223 __refill_cfs_bandwidth_runtime(cfs_b);
9224 /* restart the period timer (if active) to handle new period expiry */
9225 if (runtime_enabled && cfs_b->timer_active) {
9226 /* force a reprogram */
9227 cfs_b->timer_active = 0;
9228 __start_cfs_bandwidth(cfs_b);
9229 }
9230 raw_spin_unlock_irq(&cfs_b->lock);
9231
9232 for_each_possible_cpu(i) {
9233 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9234 struct rq *rq = rq_of(cfs_rq);
9235
9236 raw_spin_lock_irq(&rq->lock);
9237 cfs_rq->runtime_enabled = runtime_enabled;
9238 cfs_rq->runtime_remaining = 0;
9239
9240 if (cfs_rq_throttled(cfs_rq))
9241 unthrottle_cfs_rq(cfs_rq);
9242 raw_spin_unlock_irq(&rq->lock);
9243 }
9244out_unlock:
9245 mutex_unlock(&cfs_constraints_mutex);
9246
9247 return ret;
9248}
9249
9250int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9251{
9252 u64 quota, period;
9253
9254 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9255 if (cfs_quota_us < 0)
9256 quota = RUNTIME_INF;
9257 else
9258 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9259
9260 return tg_set_cfs_bandwidth(tg, period, quota);
9261}
9262
9263long tg_get_cfs_quota(struct task_group *tg)
9264{
9265 u64 quota_us;
9266
9267 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
9268 return -1;
9269
9270 quota_us = tg_cfs_bandwidth(tg)->quota;
9271 do_div(quota_us, NSEC_PER_USEC);
9272
9273 return quota_us;
9274}
9275
9276int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9277{
9278 u64 quota, period;
9279
9280 period = (u64)cfs_period_us * NSEC_PER_USEC;
9281 quota = tg_cfs_bandwidth(tg)->quota;
9282
9283 if (period <= 0)
9284 return -EINVAL;
9285
9286 return tg_set_cfs_bandwidth(tg, period, quota);
9287}
9288
9289long tg_get_cfs_period(struct task_group *tg)
9290{
9291 u64 cfs_period_us;
9292
9293 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9294 do_div(cfs_period_us, NSEC_PER_USEC);
9295
9296 return cfs_period_us;
9297}
9298
9299static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
9300{
9301 return tg_get_cfs_quota(cgroup_tg(cgrp));
9302}
9303
9304static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
9305 s64 cfs_quota_us)
9306{
9307 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
9308}
9309
9310static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
9311{
9312 return tg_get_cfs_period(cgroup_tg(cgrp));
9313}
9314
9315static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9316 u64 cfs_period_us)
9317{
9318 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
9319}
9320
9321struct cfs_schedulable_data {
9322 struct task_group *tg;
9323 u64 period, quota;
9324};
9325
9326/*
9327 * normalize group quota/period to be quota/max_period
9328 * note: units are usecs
9329 */
9330static u64 normalize_cfs_quota(struct task_group *tg,
9331 struct cfs_schedulable_data *d)
9332{
9333 u64 quota, period;
9334
9335 if (tg == d->tg) {
9336 period = d->period;
9337 quota = d->quota;
9338 } else {
9339 period = tg_get_cfs_period(tg);
9340 quota = tg_get_cfs_quota(tg);
9341 }
9342
9343 /* note: these should typically be equivalent */
9344 if (quota == RUNTIME_INF || quota == -1)
9345 return RUNTIME_INF;
9346
9347 return to_ratio(period, quota);
9348}
9349
9350static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9351{
9352 struct cfs_schedulable_data *d = data;
9353 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9354 s64 quota = 0, parent_quota = -1;
9355
9356 if (!tg->parent) {
9357 quota = RUNTIME_INF;
9358 } else {
9359 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
9360
9361 quota = normalize_cfs_quota(tg, d);
9362 parent_quota = parent_b->hierarchal_quota;
9363
9364 /*
9365 * ensure max(child_quota) <= parent_quota, inherit when no
9366 * limit is set
9367 */
9368 if (quota == RUNTIME_INF)
9369 quota = parent_quota;
9370 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
9371 return -EINVAL;
9372 }
9373 cfs_b->hierarchal_quota = quota;
9374
9375 return 0;
9376}
9377
9378static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9379{
9380 int ret;
9381 struct cfs_schedulable_data data = {
9382 .tg = tg,
9383 .period = period,
9384 .quota = quota,
9385 };
9386
9387 if (quota != RUNTIME_INF) {
9388 do_div(data.period, NSEC_PER_USEC);
9389 do_div(data.quota, NSEC_PER_USEC);
9390 }
9391
9392 rcu_read_lock();
9393 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
9394 rcu_read_unlock();
9395
9396 return ret;
9397}
9398
9399static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9400 struct cgroup_map_cb *cb)
9401{
9402 struct task_group *tg = cgroup_tg(cgrp);
9403 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9404
9405 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9406 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
9407 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
9408
9409 return 0;
9410}
9411#endif /* CONFIG_CFS_BANDWIDTH */
8995#endif /* CONFIG_FAIR_GROUP_SCHED */ 9412#endif /* CONFIG_FAIR_GROUP_SCHED */
8996 9413
8997#ifdef CONFIG_RT_GROUP_SCHED 9414#ifdef CONFIG_RT_GROUP_SCHED
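
tg_cfs_schedulable_down() above enforces that no child group's normalized
quota exceeds its parent's: normalize_cfs_quota() reduces each quota/period
pair to the same fixed-point ratio that to_ratio() computes for RT bandwidth.
A standalone illustration of that arithmetic (the quota and period values are
made up, and the RUNTIME_INF special case is omitted):

#include <stdint.h>
#include <stdio.h>

/* Same fixed-point ratio as to_ratio(): runtime/period scaled by 2^20.
 * Units are microseconds here, matching __cfs_schedulable()'s do_div. */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << 20) / period;
}

int main(void)
{
	/* parent: 50ms quota per 100ms period; child asks for 60ms per 100ms */
	uint64_t parent = to_ratio(100000, 50000);	/* ~0.5 * 2^20 = 524288 */
	uint64_t child  = to_ratio(100000, 60000);	/* ~0.6 * 2^20 = 629145 */

	/* tg_cfs_schedulable_down() rejects child > parent with -EINVAL */
	printf("parent=%llu child=%llu -> %s\n",
	       (unsigned long long)parent, (unsigned long long)child,
	       child > parent ? "rejected" : "allowed");
	return 0;
}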
@@ -9026,6 +9443,22 @@ static struct cftype cpu_files[] = {
9026 .write_u64 = cpu_shares_write_u64, 9443 .write_u64 = cpu_shares_write_u64,
9027 }, 9444 },
9028#endif 9445#endif
9446#ifdef CONFIG_CFS_BANDWIDTH
9447 {
9448 .name = "cfs_quota_us",
9449 .read_s64 = cpu_cfs_quota_read_s64,
9450 .write_s64 = cpu_cfs_quota_write_s64,
9451 },
9452 {
9453 .name = "cfs_period_us",
9454 .read_u64 = cpu_cfs_period_read_u64,
9455 .write_u64 = cpu_cfs_period_write_u64,
9456 },
9457 {
9458 .name = "stat",
9459 .read_map = cpu_stats_show,
9460 },
9461#endif
9029#ifdef CONFIG_RT_GROUP_SCHED 9462#ifdef CONFIG_RT_GROUP_SCHED
9030 { 9463 {
9031 .name = "rt_runtime_us", 9464 .name = "rt_runtime_us",
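
The cftype entries above are the user-facing knobs of the new bandwidth
controller: once the cpu subsystem adds its prefix they appear as
cpu.cfs_quota_us and cpu.cfs_period_us, where quota is the runtime (in
microseconds) a group may consume per period and -1 means unlimited. A hedged
sketch of configuring a group to 25ms of CPU time per 100ms period; the cgroup
mount point and group name are assumptions, not part of the patch:

#include <stdio.h>
#include <stdlib.h>

/* Write a single value to a cgroup control file; the path layout assumes a
 * cgroup v1 cpu controller mounted at /sys/fs/cgroup/cpu with a group
 * called "mygroup" already created. */
static int write_cg(const char *file, long long val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/fs/cgroup/cpu/mygroup/%s", file);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%lld\n", val);
	return fclose(f);
}

int main(void)
{
	/* 25ms of runtime every 100ms: the group is throttled at ~25% of one CPU */
	if (write_cg("cpu.cfs_period_us", 100000) ||
	    write_cg("cpu.cfs_quota_us", 25000)) {
		perror("cfs bandwidth setup");
		return EXIT_FAILURE;
	}
	return 0;
}

Both values are bounded by the patch: the quota and period must be at least
min_cfs_quota_period (1ms) and the period at most max_cfs_quota_period (1s).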
@@ -9335,4 +9768,3 @@ struct cgroup_subsys cpuacct_subsys = {
9335 .subsys_id = cpuacct_subsys_id, 9768 .subsys_id = cpuacct_subsys_id,
9336}; 9769};
9337#endif /* CONFIG_CGROUP_CPUACCT */ 9770#endif /* CONFIG_CGROUP_CPUACCT */
9338
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 2722dc1b4138..a86cf9d9eb11 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,6 @@ static int convert_prio(int prio)
47 return cpupri; 47 return cpupri;
48} 48}
49 49
50#define for_each_cpupri_active(array, idx) \
51 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
52
53/** 50/**
54 * cpupri_find - find the best (lowest-pri) CPU in the system 51 * cpupri_find - find the best (lowest-pri) CPU in the system
55 * @cp: The cpupri context 52 * @cp: The cpupri context
@@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
71 int idx = 0; 68 int idx = 0;
72 int task_pri = convert_prio(p->prio); 69 int task_pri = convert_prio(p->prio);
73 70
74 for_each_cpupri_active(cp->pri_active, idx) { 71 if (task_pri >= MAX_RT_PRIO)
75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 72 return 0;
76 73
77 if (idx >= task_pri) 74 for (idx = 0; idx < task_pri; idx++) {
78 break; 75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
76 int skip = 0;
77
78 if (!atomic_read(&(vec)->count))
79 skip = 1;
80 /*
81 * When looking at the vector, we need to read the counter,
82 * do a memory barrier, then read the mask.
83 *
84 * Note: This is still all racey, but we can deal with it.
85 * Ideally, we only want to look at masks that are set.
86 *
87 * If a mask is not set, then the only thing wrong is that we
88 * did a little more work than necessary.
89 *
90 * If we read a zero count but the mask is set, because of the
91 * memory barriers, that can only happen when the highest prio
92 * task for a run queue has left the run queue, in which case,
93 * it will be followed by a pull. If the task we are processing
94 * fails to find a proper place to go, that pull request will
95 * pull this task if the run queue is running at a lower
96 * priority.
97 */
98 smp_rmb();
99
100 /* Need to do the rmb for every iteration */
101 if (skip)
102 continue;
79 103
80 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 104 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
81 continue; 105 continue;
@@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
115{ 139{
116 int *currpri = &cp->cpu_to_pri[cpu]; 140 int *currpri = &cp->cpu_to_pri[cpu];
117 int oldpri = *currpri; 141 int oldpri = *currpri;
118 unsigned long flags; 142 int do_mb = 0;
119 143
120 newpri = convert_prio(newpri); 144 newpri = convert_prio(newpri);
121 145
@@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
128 * If the cpu was currently mapped to a different value, we 152 * If the cpu was currently mapped to a different value, we
129 * need to map it to the new value then remove the old value. 153 * need to map it to the new value then remove the old value.
130 * Note, we must add the new value first, otherwise we risk the 154 * Note, we must add the new value first, otherwise we risk the
131 * cpu being cleared from pri_active, and this cpu could be 155 * cpu being missed by the priority loop in cpupri_find.
132 * missed for a push or pull.
133 */ 156 */
134 if (likely(newpri != CPUPRI_INVALID)) { 157 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 158 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136 159
137 raw_spin_lock_irqsave(&vec->lock, flags);
138
139 cpumask_set_cpu(cpu, vec->mask); 160 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 161 /*
141 if (vec->count == 1) 162 * When adding a new vector, we update the mask first,
142 set_bit(newpri, cp->pri_active); 163 * do a write memory barrier, and then update the count, to
143 164 * make sure the vector is visible when count is set.
144 raw_spin_unlock_irqrestore(&vec->lock, flags); 165 */
166 smp_mb__before_atomic_inc();
167 atomic_inc(&(vec)->count);
168 do_mb = 1;
145 } 169 }
146 if (likely(oldpri != CPUPRI_INVALID)) { 170 if (likely(oldpri != CPUPRI_INVALID)) {
147 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 171 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
148 172
149 raw_spin_lock_irqsave(&vec->lock, flags); 173 /*
150 174 * Because the order of modification of the vec->count
151 vec->count--; 175 * is important, we must make sure that the update
152 if (!vec->count) 176 * of the new prio is seen before we decrement the
153 clear_bit(oldpri, cp->pri_active); 177 * old prio. This makes sure that the loop sees
178 * one or the other when we raise the priority of
179 * the run queue. We don't care about when we lower the
180 * priority, as that will trigger an rt pull anyway.
181 *
182 * We only need to do a memory barrier if we updated
183 * the new priority vec.
184 */
185 if (do_mb)
186 smp_mb__after_atomic_inc();
187
188 /*
189 * When removing from the vector, we decrement the counter first
190 * do a memory barrier and then clear the mask.
191 */
192 atomic_dec(&(vec)->count);
193 smp_mb__after_atomic_inc();
154 cpumask_clear_cpu(cpu, vec->mask); 194 cpumask_clear_cpu(cpu, vec->mask);
155
156 raw_spin_unlock_irqrestore(&vec->lock, flags);
157 } 195 }
158 196
159 *currpri = newpri; 197 *currpri = newpri;
@@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp)
175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 213 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
176 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 214 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
177 215
178 raw_spin_lock_init(&vec->lock); 216 atomic_set(&vec->count, 0);
179 vec->count = 0;
180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) 217 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
181 goto cleanup; 218 goto cleanup;
182 } 219 }
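
The cpupri rework above drops the per-vector spinlock and relies purely on
ordering: when a CPU is added to a priority vector the mask is set before the
count is incremented, and when it is removed the count is dropped before the
mask is cleared, with memory barriers in between, so a racing cpupri_find()
either skips the vector or sees a valid mask. A user-space sketch of that
publish/retire ordering, with C11 atomics standing in for the kernel's
atomic_t and smp_mb*() primitives (illustrative only, not the kernel code):

#include <stdatomic.h>
#include <stdbool.h>

struct vec {
	atomic_int count;			/* stands in for atomic_t count */
	_Atomic unsigned long long mask;	/* stands in for the cpumask    */
};

/* Publish: set the mask bit first, full barrier, then bump the count,
 * so a reader that sees count != 0 is guaranteed to see the bit. */
static void vec_add_cpu(struct vec *v, int cpu)
{
	atomic_fetch_or(&v->mask, 1ULL << cpu);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__before_atomic_inc() */
	atomic_fetch_add(&v->count, 1);
}

/* Retire: drop the count first, full barrier, then clear the bit. */
static void vec_del_cpu(struct vec *v, int cpu)
{
	atomic_fetch_sub(&v->count, 1);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__after_atomic_inc() */
	atomic_fetch_and(&v->mask, ~(1ULL << cpu));
}

/* Reader side, as in cpupri_find(): check the count, read barrier, then
 * look at the mask; a stale non-zero count only costs a wasted scan. */
static bool vec_has_cpu(struct vec *v, int cpu)
{
	if (!atomic_load(&v->count))
		return false;
	atomic_thread_fence(memory_order_acquire);	/* smp_rmb() */
	return atomic_load(&v->mask) & (1ULL << cpu);
}

int main(void)
{
	struct vec v = { 0 };

	vec_add_cpu(&v, 3);
	if (!vec_has_cpu(&v, 3))
		return 1;
	vec_del_cpu(&v, 3);
	return 0;
}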
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9fc7d386fea4..f6d756173491 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -4,7 +4,6 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5 5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
8 7
9#define CPUPRI_INVALID -1 8#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0 9#define CPUPRI_IDLE 0
@@ -12,14 +11,12 @@
12/* values 2-101 are RT priorities 0-99 */ 11/* values 2-101 are RT priorities 0-99 */
13 12
14struct cpupri_vec { 13struct cpupri_vec {
15 raw_spinlock_t lock; 14 atomic_t count;
16 int count; 15 cpumask_var_t mask;
17 cpumask_var_t mask;
18}; 16};
19 17
20struct cpupri { 18struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS]; 20 int cpu_to_pri[NR_CPUS];
24}; 21};
25 22
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bc8ee9993814..5c9e67923b7c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
89 */ 89 */
90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; 90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
91 91
92#ifdef CONFIG_CFS_BANDWIDTH
93/*
94 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
95 * each time a cfs_rq requests quota.
96 *
97 * Note: in the case that the slice exceeds the runtime remaining (either due
98 * to consumption or the quota being specified to be smaller than the slice)
99 * we will always only issue the remaining available time.
100 *
101 * default: 5 msec, units: microseconds
102 */
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif
105
92static const struct sched_class fair_sched_class; 106static const struct sched_class fair_sched_class;
93 107
94/************************************************************** 108/**************************************************************
@@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
292 306
293#endif /* CONFIG_FAIR_GROUP_SCHED */ 307#endif /* CONFIG_FAIR_GROUP_SCHED */
294 308
309static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
310 unsigned long delta_exec);
295 311
296/************************************************************** 312/**************************************************************
297 * Scheduling class tree data structure manipulation methods: 313 * Scheduling class tree data structure manipulation methods:
@@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
583 cpuacct_charge(curtask, delta_exec); 599 cpuacct_charge(curtask, delta_exec);
584 account_group_exec_runtime(curtask, delta_exec); 600 account_group_exec_runtime(curtask, delta_exec);
585 } 601 }
602
603 account_cfs_rq_runtime(cfs_rq, delta_exec);
586} 604}
587 605
588static inline void 606static inline void
@@ -688,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
688} 706}
689 707
690#ifdef CONFIG_FAIR_GROUP_SCHED 708#ifdef CONFIG_FAIR_GROUP_SCHED
709/* we need this in update_cfs_load and load-balance functions below */
710static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
691# ifdef CONFIG_SMP 711# ifdef CONFIG_SMP
692static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, 712static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
693 int global_update) 713 int global_update)
@@ -710,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
710 u64 now, delta; 730 u64 now, delta;
711 unsigned long load = cfs_rq->load.weight; 731 unsigned long load = cfs_rq->load.weight;
712 732
713 if (cfs_rq->tg == &root_task_group) 733 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
714 return; 734 return;
715 735
716 now = rq_of(cfs_rq)->clock_task; 736 now = rq_of(cfs_rq)->clock_task;
@@ -819,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
819 839
820 tg = cfs_rq->tg; 840 tg = cfs_rq->tg;
821 se = tg->se[cpu_of(rq_of(cfs_rq))]; 841 se = tg->se[cpu_of(rq_of(cfs_rq))];
822 if (!se) 842 if (!se || throttled_hierarchy(cfs_rq))
823 return; 843 return;
824#ifndef CONFIG_SMP 844#ifndef CONFIG_SMP
825 if (likely(se->load.weight == tg->shares)) 845 if (likely(se->load.weight == tg->shares))
@@ -950,6 +970,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
950 se->vruntime = vruntime; 970 se->vruntime = vruntime;
951} 971}
952 972
973static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
974
953static void 975static void
954enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 976enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
955{ 977{
@@ -979,8 +1001,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
979 __enqueue_entity(cfs_rq, se); 1001 __enqueue_entity(cfs_rq, se);
980 se->on_rq = 1; 1002 se->on_rq = 1;
981 1003
982 if (cfs_rq->nr_running == 1) 1004 if (cfs_rq->nr_running == 1) {
983 list_add_leaf_cfs_rq(cfs_rq); 1005 list_add_leaf_cfs_rq(cfs_rq);
1006 check_enqueue_throttle(cfs_rq);
1007 }
984} 1008}
985 1009
986static void __clear_buddies_last(struct sched_entity *se) 1010static void __clear_buddies_last(struct sched_entity *se)
@@ -1028,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1028 __clear_buddies_skip(se); 1052 __clear_buddies_skip(se);
1029} 1053}
1030 1054
1055static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1056
1031static void 1057static void
1032dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1058dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1033{ 1059{
@@ -1066,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1066 if (!(flags & DEQUEUE_SLEEP)) 1092 if (!(flags & DEQUEUE_SLEEP))
1067 se->vruntime -= cfs_rq->min_vruntime; 1093 se->vruntime -= cfs_rq->min_vruntime;
1068 1094
1095 /* return excess runtime on last dequeue */
1096 return_cfs_rq_runtime(cfs_rq);
1097
1069 update_min_vruntime(cfs_rq); 1098 update_min_vruntime(cfs_rq);
1070 update_cfs_shares(cfs_rq); 1099 update_cfs_shares(cfs_rq);
1071} 1100}
@@ -1077,6 +1106,8 @@ static void
1077check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1106check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1078{ 1107{
1079 unsigned long ideal_runtime, delta_exec; 1108 unsigned long ideal_runtime, delta_exec;
1109 struct sched_entity *se;
1110 s64 delta;
1080 1111
1081 ideal_runtime = sched_slice(cfs_rq, curr); 1112 ideal_runtime = sched_slice(cfs_rq, curr);
1082 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 1113 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
@@ -1095,22 +1126,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1095 * narrow margin doesn't have to wait for a full slice. 1126 * narrow margin doesn't have to wait for a full slice.
1096 * This also mitigates buddy induced latencies under load. 1127 * This also mitigates buddy induced latencies under load.
1097 */ 1128 */
1098 if (!sched_feat(WAKEUP_PREEMPT))
1099 return;
1100
1101 if (delta_exec < sysctl_sched_min_granularity) 1129 if (delta_exec < sysctl_sched_min_granularity)
1102 return; 1130 return;
1103 1131
1104 if (cfs_rq->nr_running > 1) { 1132 se = __pick_first_entity(cfs_rq);
1105 struct sched_entity *se = __pick_first_entity(cfs_rq); 1133 delta = curr->vruntime - se->vruntime;
1106 s64 delta = curr->vruntime - se->vruntime;
1107 1134
1108 if (delta < 0) 1135 if (delta < 0)
1109 return; 1136 return;
1110 1137
1111 if (delta > ideal_runtime) 1138 if (delta > ideal_runtime)
1112 resched_task(rq_of(cfs_rq)->curr); 1139 resched_task(rq_of(cfs_rq)->curr);
1113 }
1114} 1140}
1115 1141
1116static void 1142static void
@@ -1185,6 +1211,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1185 return se; 1211 return se;
1186} 1212}
1187 1213
1214static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1215
1188static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 1216static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1189{ 1217{
1190 /* 1218 /*
@@ -1194,6 +1222,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1194 if (prev->on_rq) 1222 if (prev->on_rq)
1195 update_curr(cfs_rq); 1223 update_curr(cfs_rq);
1196 1224
1225 /* throttle cfs_rqs exceeding runtime */
1226 check_cfs_rq_runtime(cfs_rq);
1227
1197 check_spread(cfs_rq, prev); 1228 check_spread(cfs_rq, prev);
1198 if (prev->on_rq) { 1229 if (prev->on_rq) {
1199 update_stats_wait_start(cfs_rq, prev); 1230 update_stats_wait_start(cfs_rq, prev);
@@ -1233,10 +1264,583 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1233 return; 1264 return;
1234#endif 1265#endif
1235 1266
1236 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) 1267 if (cfs_rq->nr_running > 1)
1237 check_preempt_tick(cfs_rq, curr); 1268 check_preempt_tick(cfs_rq, curr);
1238} 1269}
1239 1270
1271
1272/**************************************************
1273 * CFS bandwidth control machinery
1274 */
1275
1276#ifdef CONFIG_CFS_BANDWIDTH
1277/*
1278 * default period for cfs group bandwidth.
1279 * default: 0.1s, units: nanoseconds
1280 */
1281static inline u64 default_cfs_period(void)
1282{
1283 return 100000000ULL;
1284}
1285
1286static inline u64 sched_cfs_bandwidth_slice(void)
1287{
1288 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1289}
1290
1291/*
1292 * Replenish runtime according to assigned quota and update expiration time.
1293 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
1294 * additional synchronization around rq->lock.
1295 *
1296 * requires cfs_b->lock
1297 */
1298static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1299{
1300 u64 now;
1301
1302 if (cfs_b->quota == RUNTIME_INF)
1303 return;
1304
1305 now = sched_clock_cpu(smp_processor_id());
1306 cfs_b->runtime = cfs_b->quota;
1307 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1308}
1309
1310/* returns 0 on failure to allocate runtime */
1311static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1312{
1313 struct task_group *tg = cfs_rq->tg;
1314 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1315 u64 amount = 0, min_amount, expires;
1316
1317 /* note: this is a positive sum as runtime_remaining <= 0 */
1318 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1319
1320 raw_spin_lock(&cfs_b->lock);
1321 if (cfs_b->quota == RUNTIME_INF)
1322 amount = min_amount;
1323 else {
1324 /*
1325 * If the bandwidth pool has become inactive, then at least one
1326 * period must have elapsed since the last consumption.
 1327 * Refresh the global state and ensure the bandwidth timer
 1328 * becomes active.
1329 */
1330 if (!cfs_b->timer_active) {
1331 __refill_cfs_bandwidth_runtime(cfs_b);
1332 __start_cfs_bandwidth(cfs_b);
1333 }
1334
1335 if (cfs_b->runtime > 0) {
1336 amount = min(cfs_b->runtime, min_amount);
1337 cfs_b->runtime -= amount;
1338 cfs_b->idle = 0;
1339 }
1340 }
1341 expires = cfs_b->runtime_expires;
1342 raw_spin_unlock(&cfs_b->lock);
1343
1344 cfs_rq->runtime_remaining += amount;
1345 /*
1346 * we may have advanced our local expiration to account for allowed
1347 * spread between our sched_clock and the one on which runtime was
1348 * issued.
1349 */
1350 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1351 cfs_rq->runtime_expires = expires;
1352
1353 return cfs_rq->runtime_remaining > 0;
1354}
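
The draw in assign_cfs_rq_runtime() is deliberately coarse: each cfs_rq tops itself up from the group-wide pool one bandwidth slice at a time rather than on every accounting call, which keeps traffic on cfs_b->lock low. A minimal user-space model of that draw, with hypothetical names, no locking, and a 5 ms slice assumed for the example:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* simplified model: all values in nanoseconds; names are illustrative only */
struct pool {                       /* stands in for struct cfs_bandwidth */
        int64_t runtime;            /* runtime left in the current period */
};

struct local {                      /* stands in for one cfs_rq */
        int64_t runtime_remaining;
};

#define SLICE_NS (5000LL * 1000)    /* assumed 5 ms slice for this example */

/* returns true if the local queue may keep running */
static bool assign_runtime(struct pool *p, struct local *l)
{
        /* top the local queue back up to one full slice */
        int64_t want = SLICE_NS - l->runtime_remaining;
        int64_t got = want < p->runtime ? want : p->runtime;

        if (got > 0) {
                p->runtime -= got;
                l->runtime_remaining += got;
        }
        return l->runtime_remaining > 0;
}

int main(void)
{
        struct pool p = { .runtime = 12LL * 1000 * 1000 };    /* 12 ms left  */
        struct local cpu0 = { .runtime_remaining = -200000 }; /* 0.2 ms debt */

        while (assign_runtime(&p, &cpu0)) {
                /* pretend the cpu burned a whole slice of CFS time */
                cpu0.runtime_remaining -= SLICE_NS;
        }
        printf("pool exhausted, cpu0 must throttle (pool=%lld)\n",
               (long long)p.runtime);
        return 0;
}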
1355
1356/*
1357 * Note: This depends on the synchronization provided by sched_clock and the
1358 * fact that rq->clock snapshots this value.
1359 */
1360static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1361{
1362 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1363 struct rq *rq = rq_of(cfs_rq);
1364
1365 /* if the deadline is ahead of our clock, nothing to do */
1366 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1367 return;
1368
1369 if (cfs_rq->runtime_remaining < 0)
1370 return;
1371
1372 /*
1373 * If the local deadline has passed we have to consider the
1374 * possibility that our sched_clock is 'fast' and the global deadline
1375 * has not truly expired.
1376 *
 1377 * Fortunately we can determine whether this is the case by checking
1378 * whether the global deadline has advanced.
1379 */
1380
1381 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1382 /* extend local deadline, drift is bounded above by 2 ticks */
1383 cfs_rq->runtime_expires += TICK_NSEC;
1384 } else {
1385 /* global deadline is ahead, expiration has passed */
1386 cfs_rq->runtime_remaining = 0;
1387 }
1388}
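
Both deadline tests above use the usual wrap-safe clock comparison: subtract the two u64 timestamps and look at the sign of the difference, so the result stays correct even if the clock were ever to wrap. A self-contained sketch of the idiom (helper name is made up):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* true if 'now' is at or past 'expires'; valid across u64 wraparound
 * (relies on the two's-complement interpretation of the difference) */
static bool deadline_passed(uint64_t now, uint64_t expires)
{
        return (int64_t)(now - expires) >= 0;
}

int main(void)
{
        uint64_t expires = UINT64_MAX - 50;     /* deadline just before wrap */

        printf("%d\n", deadline_passed(UINT64_MAX - 100, expires)); /* 0: not yet        */
        printf("%d\n", deadline_passed(25, expires));               /* 1: wrapped past it */
        return 0;
}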
1389
1390static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1391 unsigned long delta_exec)
1392{
1393 /* dock delta_exec before expiring quota (as it could span periods) */
1394 cfs_rq->runtime_remaining -= delta_exec;
1395 expire_cfs_rq_runtime(cfs_rq);
1396
1397 if (likely(cfs_rq->runtime_remaining > 0))
1398 return;
1399
1400 /*
1401 * if we're unable to extend our runtime we resched so that the active
1402 * hierarchy can be throttled
1403 */
1404 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1405 resched_task(rq_of(cfs_rq)->curr);
1406}
1407
1408static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1409 unsigned long delta_exec)
1410{
1411 if (!cfs_rq->runtime_enabled)
1412 return;
1413
1414 __account_cfs_rq_runtime(cfs_rq, delta_exec);
1415}
1416
1417static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1418{
1419 return cfs_rq->throttled;
1420}
1421
1422/* check whether cfs_rq, or any parent, is throttled */
1423static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1424{
1425 return cfs_rq->throttle_count;
1426}
1427
1428/*
1429 * Ensure that neither of the group entities corresponding to src_cpu or
1430 * dest_cpu are members of a throttled hierarchy when performing group
1431 * load-balance operations.
1432 */
1433static inline int throttled_lb_pair(struct task_group *tg,
1434 int src_cpu, int dest_cpu)
1435{
1436 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1437
1438 src_cfs_rq = tg->cfs_rq[src_cpu];
1439 dest_cfs_rq = tg->cfs_rq[dest_cpu];
1440
1441 return throttled_hierarchy(src_cfs_rq) ||
1442 throttled_hierarchy(dest_cfs_rq);
1443}
1444
1445/* updated child weight may affect parent so we have to do this bottom up */
1446static int tg_unthrottle_up(struct task_group *tg, void *data)
1447{
1448 struct rq *rq = data;
1449 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1450
1451 cfs_rq->throttle_count--;
1452#ifdef CONFIG_SMP
1453 if (!cfs_rq->throttle_count) {
1454 u64 delta = rq->clock_task - cfs_rq->load_stamp;
1455
1456 /* leaving throttled state, advance shares averaging windows */
1457 cfs_rq->load_stamp += delta;
1458 cfs_rq->load_last += delta;
1459
1460 /* update entity weight now that we are on_rq again */
1461 update_cfs_shares(cfs_rq);
1462 }
1463#endif
1464
1465 return 0;
1466}
1467
1468static int tg_throttle_down(struct task_group *tg, void *data)
1469{
1470 struct rq *rq = data;
1471 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1472
1473 /* group is entering throttled state, record last load */
1474 if (!cfs_rq->throttle_count)
1475 update_cfs_load(cfs_rq, 0);
1476 cfs_rq->throttle_count++;
1477
1478 return 0;
1479}
1480
1481static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1482{
1483 struct rq *rq = rq_of(cfs_rq);
1484 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1485 struct sched_entity *se;
1486 long task_delta, dequeue = 1;
1487
1488 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1489
1490 /* account load preceding throttle */
1491 rcu_read_lock();
1492 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1493 rcu_read_unlock();
1494
1495 task_delta = cfs_rq->h_nr_running;
1496 for_each_sched_entity(se) {
1497 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1498 /* throttled entity or throttle-on-deactivate */
1499 if (!se->on_rq)
1500 break;
1501
1502 if (dequeue)
1503 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1504 qcfs_rq->h_nr_running -= task_delta;
1505
1506 if (qcfs_rq->load.weight)
1507 dequeue = 0;
1508 }
1509
1510 if (!se)
1511 rq->nr_running -= task_delta;
1512
1513 cfs_rq->throttled = 1;
1514 cfs_rq->throttled_timestamp = rq->clock;
1515 raw_spin_lock(&cfs_b->lock);
1516 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1517 raw_spin_unlock(&cfs_b->lock);
1518}
1519
1520static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1521{
1522 struct rq *rq = rq_of(cfs_rq);
1523 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1524 struct sched_entity *se;
1525 int enqueue = 1;
1526 long task_delta;
1527
1528 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1529
1530 cfs_rq->throttled = 0;
1531 raw_spin_lock(&cfs_b->lock);
1532 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1533 list_del_rcu(&cfs_rq->throttled_list);
1534 raw_spin_unlock(&cfs_b->lock);
1535 cfs_rq->throttled_timestamp = 0;
1536
1537 update_rq_clock(rq);
1538 /* update hierarchical throttle state */
1539 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1540
1541 if (!cfs_rq->load.weight)
1542 return;
1543
1544 task_delta = cfs_rq->h_nr_running;
1545 for_each_sched_entity(se) {
1546 if (se->on_rq)
1547 enqueue = 0;
1548
1549 cfs_rq = cfs_rq_of(se);
1550 if (enqueue)
1551 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1552 cfs_rq->h_nr_running += task_delta;
1553
1554 if (cfs_rq_throttled(cfs_rq))
1555 break;
1556 }
1557
1558 if (!se)
1559 rq->nr_running += task_delta;
1560
 1561 /* determine whether we need to wake up a potentially idle cpu */
1562 if (rq->curr == rq->idle && rq->cfs.nr_running)
1563 resched_task(rq->curr);
1564}
1565
1566static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
1567 u64 remaining, u64 expires)
1568{
1569 struct cfs_rq *cfs_rq;
1570 u64 runtime = remaining;
1571
1572 rcu_read_lock();
1573 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1574 throttled_list) {
1575 struct rq *rq = rq_of(cfs_rq);
1576
1577 raw_spin_lock(&rq->lock);
1578 if (!cfs_rq_throttled(cfs_rq))
1579 goto next;
1580
1581 runtime = -cfs_rq->runtime_remaining + 1;
1582 if (runtime > remaining)
1583 runtime = remaining;
1584 remaining -= runtime;
1585
1586 cfs_rq->runtime_remaining += runtime;
1587 cfs_rq->runtime_expires = expires;
1588
1589 /* we check whether we're throttled above */
1590 if (cfs_rq->runtime_remaining > 0)
1591 unthrottle_cfs_rq(cfs_rq);
1592
1593next:
1594 raw_spin_unlock(&rq->lock);
1595
1596 if (!remaining)
1597 break;
1598 }
1599 rcu_read_unlock();
1600
1601 return remaining;
1602}
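
distribute_cfs_runtime() gives each throttled queue only enough to bring its runtime_remaining just past zero (hence the +1 above) and stops once the pool is exhausted; queues still in deficit stay throttled until the next refill. A rough stand-alone model of that loop, with invented numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* deficits (<= 0) of three throttled queues, in microseconds */
        int64_t deficit[3] = { -3000, -500, -4000 };
        int64_t remaining = 4000;               /* freshly refilled pool */

        for (int i = 0; i < 3 && remaining > 0; i++) {
                int64_t need = -deficit[i] + 1; /* just past zero */
                int64_t give = need < remaining ? need : remaining;

                deficit[i] += give;
                remaining  -= give;
                printf("queue %d: remaining=%lld -> %s\n", i,
                       (long long)deficit[i],
                       deficit[i] > 0 ? "unthrottled" : "still throttled");
        }
        printf("pool left: %lld\n", (long long)remaining);
        return 0;
}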
1603
1604/*
1605 * Responsible for refilling a task_group's bandwidth and unthrottling its
1606 * cfs_rqs as appropriate. If there has been no activity within the last
1607 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
1608 * used to track this state.
1609 */
1610static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
1611{
1612 u64 runtime, runtime_expires;
1613 int idle = 1, throttled;
1614
1615 raw_spin_lock(&cfs_b->lock);
1616 /* no need to continue the timer with no bandwidth constraint */
1617 if (cfs_b->quota == RUNTIME_INF)
1618 goto out_unlock;
1619
1620 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1621 /* idle depends on !throttled (for the case of a large deficit) */
1622 idle = cfs_b->idle && !throttled;
1623 cfs_b->nr_periods += overrun;
1624
1625 /* if we're going inactive then everything else can be deferred */
1626 if (idle)
1627 goto out_unlock;
1628
1629 __refill_cfs_bandwidth_runtime(cfs_b);
1630
1631 if (!throttled) {
1632 /* mark as potentially idle for the upcoming period */
1633 cfs_b->idle = 1;
1634 goto out_unlock;
1635 }
1636
1637 /* account preceding periods in which throttling occurred */
1638 cfs_b->nr_throttled += overrun;
1639
1640 /*
1641 * There are throttled entities so we must first use the new bandwidth
1642 * to unthrottle them before making it generally available. This
1643 * ensures that all existing debts will be paid before a new cfs_rq is
1644 * allowed to run.
1645 */
1646 runtime = cfs_b->runtime;
1647 runtime_expires = cfs_b->runtime_expires;
1648 cfs_b->runtime = 0;
1649
1650 /*
1651 * This check is repeated as we are holding onto the new bandwidth
1652 * while we unthrottle. This can potentially race with an unthrottled
1653 * group trying to acquire new bandwidth from the global pool.
1654 */
1655 while (throttled && runtime > 0) {
1656 raw_spin_unlock(&cfs_b->lock);
1657 /* we can't nest cfs_b->lock while distributing bandwidth */
1658 runtime = distribute_cfs_runtime(cfs_b, runtime,
1659 runtime_expires);
1660 raw_spin_lock(&cfs_b->lock);
1661
1662 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1663 }
1664
1665 /* return (any) remaining runtime */
1666 cfs_b->runtime = runtime;
1667 /*
1668 * While we are ensured activity in the period following an
1669 * unthrottle, this also covers the case in which the new bandwidth is
1670 * insufficient to cover the existing bandwidth deficit. (Forcing the
1671 * timer to remain active while there are any throttled entities.)
1672 */
1673 cfs_b->idle = 0;
1674out_unlock:
1675 if (idle)
1676 cfs_b->timer_active = 0;
1677 raw_spin_unlock(&cfs_b->lock);
1678
1679 return idle;
1680}
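
In the common case the period timer simply refills cfs_b->runtime to the quota once per period, so a group's long-run share is quota/period: with the default 100 ms period, a 25 ms quota caps the group at a quarter of one CPU, and a quota larger than the period allows more than one CPU's worth of runtime. A purely illustrative calculation:

#include <stdio.h>

int main(void)
{
        double period_us = 100000.0;    /* default_cfs_period(): 100 ms */
        double quota_us  = 25000.0;     /* example quota */

        printf("max cpu share: %.0f%% of one cpu\n",
               100.0 * quota_us / period_us);   /* 25%% */
        return 0;
}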
1681
1682/* a cfs_rq won't donate quota below this amount */
1683static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1684/* minimum remaining period time to redistribute slack quota */
1685static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1686/* how long we wait to gather additional slack before distributing */
1687static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
1688
1689/* are we near the end of the current quota period? */
1690static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1691{
1692 struct hrtimer *refresh_timer = &cfs_b->period_timer;
1693 u64 remaining;
1694
1695 /* if the call-back is running a quota refresh is already occurring */
1696 if (hrtimer_callback_running(refresh_timer))
1697 return 1;
1698
1699 /* is a quota refresh about to occur? */
1700 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1701 if (remaining < min_expire)
1702 return 1;
1703
1704 return 0;
1705}
1706
1707static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1708{
1709 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1710
1711 /* if there's a quota refresh soon don't bother with slack */
1712 if (runtime_refresh_within(cfs_b, min_left))
1713 return;
1714
1715 start_bandwidth_timer(&cfs_b->slack_timer,
1716 ns_to_ktime(cfs_bandwidth_slack_period));
1717}
1718
1719/* we know any runtime found here is valid as update_curr() precedes return */
1720static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1721{
1722 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1723 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1724
1725 if (slack_runtime <= 0)
1726 return;
1727
1728 raw_spin_lock(&cfs_b->lock);
1729 if (cfs_b->quota != RUNTIME_INF &&
1730 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1731 cfs_b->runtime += slack_runtime;
1732
1733 /* we are under rq->lock, defer unthrottling using a timer */
1734 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1735 !list_empty(&cfs_b->throttled_cfs_rq))
1736 start_cfs_slack_bandwidth(cfs_b);
1737 }
1738 raw_spin_unlock(&cfs_b->lock);
1739
1740 /* even if it's not valid for return we don't want to try again */
1741 cfs_rq->runtime_remaining -= slack_runtime;
1742}
1743
1744static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1745{
1746 if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
1747 return;
1748
1749 __return_cfs_rq_runtime(cfs_rq);
1750}
1751
1752/*
1753 * This is done with a timer (instead of inline with bandwidth return) since
1754 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
1755 */
1756static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1757{
1758 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1759 u64 expires;
1760
1761 /* confirm we're still not at a refresh boundary */
1762 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
1763 return;
1764
1765 raw_spin_lock(&cfs_b->lock);
1766 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1767 runtime = cfs_b->runtime;
1768 cfs_b->runtime = 0;
1769 }
1770 expires = cfs_b->runtime_expires;
1771 raw_spin_unlock(&cfs_b->lock);
1772
1773 if (!runtime)
1774 return;
1775
1776 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1777
1778 raw_spin_lock(&cfs_b->lock);
1779 if (expires == cfs_b->runtime_expires)
1780 cfs_b->runtime = runtime;
1781 raw_spin_unlock(&cfs_b->lock);
1782}
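
Taken together, the slack path works like this: when a queue dequeues its last task it keeps min_cfs_rq_runtime (1 ms) for a quick re-wakeup and hands the rest back; if that leaves the pool holding more than one slice while other queues are throttled, the 5 ms slack timer redistributes it instead of letting it sit idle until the next period. A small numeric sketch of the slack computation:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000LL

int main(void)
{
        int64_t min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; /* keep 1 ms locally      */
        int64_t runtime_remaining  = 4 * NSEC_PER_MSEC; /* left over at dequeue   */

        int64_t slack = runtime_remaining - min_cfs_rq_runtime;
        if (slack > 0)
                printf("return %lld ns to the global pool\n", (long long)slack);
        return 0;
}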
1783
1784/*
1785 * When a group wakes up we want to make sure that its quota is not already
1786 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 1787 * runtime as update_curr() throttling cannot trigger until it's on-rq.
1788 */
1789static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1790{
1791 /* an active group must be handled by the update_curr()->put() path */
1792 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1793 return;
1794
1795 /* ensure the group is not already throttled */
1796 if (cfs_rq_throttled(cfs_rq))
1797 return;
1798
1799 /* update runtime allocation */
1800 account_cfs_rq_runtime(cfs_rq, 0);
1801 if (cfs_rq->runtime_remaining <= 0)
1802 throttle_cfs_rq(cfs_rq);
1803}
1804
1805/* conditionally throttle active cfs_rq's from put_prev_entity() */
1806static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1807{
1808 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1809 return;
1810
1811 /*
1812 * it's possible for a throttled entity to be forced into a running
 1813 * state (e.g. set_curr_task); in this case we're finished.
1814 */
1815 if (cfs_rq_throttled(cfs_rq))
1816 return;
1817
1818 throttle_cfs_rq(cfs_rq);
1819}
1820#else
1821static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1822 unsigned long delta_exec) {}
1823static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1824static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
1825static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1826
1827static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1828{
1829 return 0;
1830}
1831
1832static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1833{
1834 return 0;
1835}
1836
1837static inline int throttled_lb_pair(struct task_group *tg,
1838 int src_cpu, int dest_cpu)
1839{
1840 return 0;
1841}
1842#endif
1843
1240/************************************************** 1844/**************************************************
1241 * CFS operations on tasks: 1845 * CFS operations on tasks:
1242 */ 1846 */
@@ -1313,16 +1917,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1313 break; 1917 break;
1314 cfs_rq = cfs_rq_of(se); 1918 cfs_rq = cfs_rq_of(se);
1315 enqueue_entity(cfs_rq, se, flags); 1919 enqueue_entity(cfs_rq, se, flags);
1920
1921 /*
1922 * end evaluation on encountering a throttled cfs_rq
1923 *
1924 * note: in the case of encountering a throttled cfs_rq we will
1925 * post the final h_nr_running increment below.
1926 */
1927 if (cfs_rq_throttled(cfs_rq))
1928 break;
1929 cfs_rq->h_nr_running++;
1930
1316 flags = ENQUEUE_WAKEUP; 1931 flags = ENQUEUE_WAKEUP;
1317 } 1932 }
1318 1933
1319 for_each_sched_entity(se) { 1934 for_each_sched_entity(se) {
1320 cfs_rq = cfs_rq_of(se); 1935 cfs_rq = cfs_rq_of(se);
1936 cfs_rq->h_nr_running++;
1937
1938 if (cfs_rq_throttled(cfs_rq))
1939 break;
1321 1940
1322 update_cfs_load(cfs_rq, 0); 1941 update_cfs_load(cfs_rq, 0);
1323 update_cfs_shares(cfs_rq); 1942 update_cfs_shares(cfs_rq);
1324 } 1943 }
1325 1944
1945 if (!se)
1946 inc_nr_running(rq);
1326 hrtick_update(rq); 1947 hrtick_update(rq);
1327} 1948}
1328 1949
@@ -1343,6 +1964,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1343 cfs_rq = cfs_rq_of(se); 1964 cfs_rq = cfs_rq_of(se);
1344 dequeue_entity(cfs_rq, se, flags); 1965 dequeue_entity(cfs_rq, se, flags);
1345 1966
1967 /*
1968 * end evaluation on encountering a throttled cfs_rq
1969 *
1970 * note: in the case of encountering a throttled cfs_rq we will
1971 * post the final h_nr_running decrement below.
1972 */
1973 if (cfs_rq_throttled(cfs_rq))
1974 break;
1975 cfs_rq->h_nr_running--;
1976
1346 /* Don't dequeue parent if it has other entities besides us */ 1977 /* Don't dequeue parent if it has other entities besides us */
1347 if (cfs_rq->load.weight) { 1978 if (cfs_rq->load.weight) {
1348 /* 1979 /*
@@ -1361,11 +1992,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1361 1992
1362 for_each_sched_entity(se) { 1993 for_each_sched_entity(se) {
1363 cfs_rq = cfs_rq_of(se); 1994 cfs_rq = cfs_rq_of(se);
1995 cfs_rq->h_nr_running--;
1996
1997 if (cfs_rq_throttled(cfs_rq))
1998 break;
1364 1999
1365 update_cfs_load(cfs_rq, 0); 2000 update_cfs_load(cfs_rq, 0);
1366 update_cfs_shares(cfs_rq); 2001 update_cfs_shares(cfs_rq);
1367 } 2002 }
1368 2003
2004 if (!se)
2005 dec_nr_running(rq);
1369 hrtick_update(rq); 2006 hrtick_update(rq);
1370} 2007}
1371 2008
@@ -1434,7 +2071,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1434 2071
1435 return wl; 2072 return wl;
1436} 2073}
1437
1438#else 2074#else
1439 2075
1440static inline unsigned long effective_load(struct task_group *tg, int cpu, 2076static inline unsigned long effective_load(struct task_group *tg, int cpu,
@@ -1547,7 +2183,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1547 2183
1548 /* Skip over this group if it has no CPUs allowed */ 2184 /* Skip over this group if it has no CPUs allowed */
1549 if (!cpumask_intersects(sched_group_cpus(group), 2185 if (!cpumask_intersects(sched_group_cpus(group),
1550 &p->cpus_allowed)) 2186 tsk_cpus_allowed(p)))
1551 continue; 2187 continue;
1552 2188
1553 local_group = cpumask_test_cpu(this_cpu, 2189 local_group = cpumask_test_cpu(this_cpu,
@@ -1593,7 +2229,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1593 int i; 2229 int i;
1594 2230
1595 /* Traverse only the allowed CPUs */ 2231 /* Traverse only the allowed CPUs */
1596 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { 2232 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
1597 load = weighted_cpuload(i); 2233 load = weighted_cpuload(i);
1598 2234
1599 if (load < min_load || (load == min_load && i == this_cpu)) { 2235 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1637,7 +2273,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1637 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 2273 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1638 break; 2274 break;
1639 2275
1640 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 2276 for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
1641 if (idle_cpu(i)) { 2277 if (idle_cpu(i)) {
1642 target = i; 2278 target = i;
1643 break; 2279 break;
@@ -1680,7 +2316,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1680 int sync = wake_flags & WF_SYNC; 2316 int sync = wake_flags & WF_SYNC;
1681 2317
1682 if (sd_flag & SD_BALANCE_WAKE) { 2318 if (sd_flag & SD_BALANCE_WAKE) {
1683 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) 2319 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1684 want_affine = 1; 2320 want_affine = 1;
1685 new_cpu = prev_cpu; 2321 new_cpu = prev_cpu;
1686 } 2322 }
@@ -1875,6 +2511,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1875 if (unlikely(se == pse)) 2511 if (unlikely(se == pse))
1876 return; 2512 return;
1877 2513
2514 /*
2515 * This is possible from callers such as pull_task(), in which we
 2516 * unconditionally check_preempt_curr() after an enqueue (which may have
 2517 * led to a throttle). This both saves work and prevents false
2518 * next-buddy nomination below.
2519 */
2520 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2521 return;
2522
1878 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { 2523 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1879 set_next_buddy(pse); 2524 set_next_buddy(pse);
1880 next_buddy_marked = 1; 2525 next_buddy_marked = 1;
@@ -1883,6 +2528,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1883 /* 2528 /*
1884 * We can come here with TIF_NEED_RESCHED already set from new task 2529 * We can come here with TIF_NEED_RESCHED already set from new task
1885 * wake up path. 2530 * wake up path.
2531 *
2532 * Note: this also catches the edge-case of curr being in a throttled
2533 * group (e.g. via set_curr_task), since update_curr() (in the
2534 * enqueue of curr) will have resulted in resched being set. This
2535 * prevents us from potentially nominating it as a false LAST_BUDDY
2536 * below.
1886 */ 2537 */
1887 if (test_tsk_need_resched(curr)) 2538 if (test_tsk_need_resched(curr))
1888 return; 2539 return;
@@ -1899,10 +2550,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1899 if (unlikely(p->policy != SCHED_NORMAL)) 2550 if (unlikely(p->policy != SCHED_NORMAL))
1900 return; 2551 return;
1901 2552
1902
1903 if (!sched_feat(WAKEUP_PREEMPT))
1904 return;
1905
1906 find_matching_se(&se, &pse); 2553 find_matching_se(&se, &pse);
1907 update_curr(cfs_rq_of(se)); 2554 update_curr(cfs_rq_of(se));
1908 BUG_ON(!pse); 2555 BUG_ON(!pse);
@@ -2005,7 +2652,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
2005{ 2652{
2006 struct sched_entity *se = &p->se; 2653 struct sched_entity *se = &p->se;
2007 2654
2008 if (!se->on_rq) 2655 /* throttled hierarchies are not runnable */
2656 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
2009 return false; 2657 return false;
2010 2658
2011 /* Tell the scheduler that we'd really like pse to run next. */ 2659 /* Tell the scheduler that we'd really like pse to run next. */
@@ -2049,7 +2697,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2049 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2697 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2050 * 3) are cache-hot on their current CPU. 2698 * 3) are cache-hot on their current CPU.
2051 */ 2699 */
2052 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 2700 if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
2053 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 2701 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2054 return 0; 2702 return 0;
2055 } 2703 }
@@ -2102,6 +2750,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2102 2750
2103 for_each_leaf_cfs_rq(busiest, cfs_rq) { 2751 for_each_leaf_cfs_rq(busiest, cfs_rq) {
2104 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { 2752 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
2753 if (throttled_lb_pair(task_group(p),
2754 busiest->cpu, this_cpu))
2755 break;
2105 2756
2106 if (!can_migrate_task(p, busiest, this_cpu, 2757 if (!can_migrate_task(p, busiest, this_cpu,
2107 sd, idle, &pinned)) 2758 sd, idle, &pinned))
@@ -2217,8 +2868,13 @@ static void update_shares(int cpu)
2217 * Iterates the task_group tree in a bottom up fashion, see 2868 * Iterates the task_group tree in a bottom up fashion, see
2218 * list_add_leaf_cfs_rq() for details. 2869 * list_add_leaf_cfs_rq() for details.
2219 */ 2870 */
2220 for_each_leaf_cfs_rq(rq, cfs_rq) 2871 for_each_leaf_cfs_rq(rq, cfs_rq) {
2872 /* throttled entities do not contribute to load */
2873 if (throttled_hierarchy(cfs_rq))
2874 continue;
2875
2221 update_shares_cpu(cfs_rq->tg, cpu); 2876 update_shares_cpu(cfs_rq->tg, cpu);
2877 }
2222 rcu_read_unlock(); 2878 rcu_read_unlock();
2223} 2879}
2224 2880
@@ -2268,9 +2924,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2268 u64 rem_load, moved_load; 2924 u64 rem_load, moved_load;
2269 2925
2270 /* 2926 /*
2271 * empty group 2927 * empty group or part of a throttled hierarchy
2272 */ 2928 */
2273 if (!busiest_cfs_rq->task_weight) 2929 if (!busiest_cfs_rq->task_weight ||
2930 throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
2274 continue; 2931 continue;
2275 2932
2276 rem_load = (u64)rem_load_move * busiest_weight; 2933 rem_load = (u64)rem_load_move * busiest_weight;
@@ -3430,7 +4087,7 @@ redo:
3430 * moved to this_cpu 4087 * moved to this_cpu
3431 */ 4088 */
3432 if (!cpumask_test_cpu(this_cpu, 4089 if (!cpumask_test_cpu(this_cpu,
3433 &busiest->curr->cpus_allowed)) { 4090 tsk_cpus_allowed(busiest->curr))) {
3434 raw_spin_unlock_irqrestore(&busiest->lock, 4091 raw_spin_unlock_irqrestore(&busiest->lock,
3435 flags); 4092 flags);
3436 all_pinned = 1; 4093 all_pinned = 1;
@@ -3612,22 +4269,6 @@ out_unlock:
3612} 4269}
3613 4270
3614#ifdef CONFIG_NO_HZ 4271#ifdef CONFIG_NO_HZ
3615
3616static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3617
3618static void trigger_sched_softirq(void *data)
3619{
3620 raise_softirq_irqoff(SCHED_SOFTIRQ);
3621}
3622
3623static inline void init_sched_softirq_csd(struct call_single_data *csd)
3624{
3625 csd->func = trigger_sched_softirq;
3626 csd->info = NULL;
3627 csd->flags = 0;
3628 csd->priv = 0;
3629}
3630
3631/* 4272/*
3632 * idle load balancing details 4273 * idle load balancing details
3633 * - One of the idle CPUs nominates itself as idle load_balancer, while 4274 * - One of the idle CPUs nominates itself as idle load_balancer, while
@@ -3667,7 +4308,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3667 struct sched_domain *sd; 4308 struct sched_domain *sd;
3668 4309
3669 for_each_domain(cpu, sd) 4310 for_each_domain(cpu, sd)
3670 if (sd && (sd->flags & flag)) 4311 if (sd->flags & flag)
3671 break; 4312 break;
3672 4313
3673 return sd; 4314 return sd;
@@ -3793,11 +4434,16 @@ static void nohz_balancer_kick(int cpu)
3793 } 4434 }
3794 4435
3795 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { 4436 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3796 struct call_single_data *cp;
3797
3798 cpu_rq(ilb_cpu)->nohz_balance_kick = 1; 4437 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3799 cp = &per_cpu(remote_sched_softirq_cb, cpu); 4438
3800 __smp_call_function_single(ilb_cpu, cp, 0); 4439 smp_mb();
4440 /*
4441 * Use smp_send_reschedule() instead of resched_cpu().
4442 * This way we generate a sched IPI on the target cpu which
4443 * is idle. And the softirq performing nohz idle load balance
4444 * will be run before returning from the IPI.
4445 */
4446 smp_send_reschedule(ilb_cpu);
3801 } 4447 }
3802 return; 4448 return;
3803} 4449}
@@ -4030,7 +4676,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
4030 if (time_before(now, nohz.next_balance)) 4676 if (time_before(now, nohz.next_balance))
4031 return 0; 4677 return 0;
4032 4678
4033 if (rq->idle_at_tick) 4679 if (idle_cpu(cpu))
4034 return 0; 4680 return 0;
4035 4681
4036 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 4682 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
@@ -4066,7 +4712,7 @@ static void run_rebalance_domains(struct softirq_action *h)
4066{ 4712{
4067 int this_cpu = smp_processor_id(); 4713 int this_cpu = smp_processor_id();
4068 struct rq *this_rq = cpu_rq(this_cpu); 4714 struct rq *this_rq = cpu_rq(this_cpu);
4069 enum cpu_idle_type idle = this_rq->idle_at_tick ? 4715 enum cpu_idle_type idle = this_rq->idle_balance ?
4070 CPU_IDLE : CPU_NOT_IDLE; 4716 CPU_IDLE : CPU_NOT_IDLE;
4071 4717
4072 rebalance_domains(this_cpu, idle); 4718 rebalance_domains(this_cpu, idle);
@@ -4251,8 +4897,13 @@ static void set_curr_task_fair(struct rq *rq)
4251{ 4897{
4252 struct sched_entity *se = &rq->curr->se; 4898 struct sched_entity *se = &rq->curr->se;
4253 4899
4254 for_each_sched_entity(se) 4900 for_each_sched_entity(se) {
4255 set_next_entity(cfs_rq_of(se), se); 4901 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4902
4903 set_next_entity(cfs_rq, se);
4904 /* ensure bandwidth has been allocated on our new cfs_rq */
4905 account_cfs_rq_runtime(cfs_rq, 0);
4906 }
4256} 4907}
4257 4908
4258#ifdef CONFIG_FAIR_GROUP_SCHED 4909#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 2e74677cb040..efa0a7b75dde 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,11 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, 1)
13 13
14/* 14/*
15 * Should wakeups try to preempt running tasks.
16 */
17SCHED_FEAT(WAKEUP_PREEMPT, 1)
18
19/*
20 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
21 * a newly woken task on the same cpu as the task that woke it -- 16 * a newly woken task on the same cpu as the task that woke it --
22 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index af1177858be3..056cbd2e2a27 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -124,21 +124,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
124 update_rt_migration(rt_rq); 124 update_rt_migration(rt_rq);
125} 125}
126 126
127static inline int has_pushable_tasks(struct rq *rq)
128{
129 return !plist_head_empty(&rq->rt.pushable_tasks);
130}
131
127static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 132static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
128{ 133{
129 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 134 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
130 plist_node_init(&p->pushable_tasks, p->prio); 135 plist_node_init(&p->pushable_tasks, p->prio);
131 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); 136 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
137
138 /* Update the highest prio pushable task */
139 if (p->prio < rq->rt.highest_prio.next)
140 rq->rt.highest_prio.next = p->prio;
132} 141}
133 142
134static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 143static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
135{ 144{
136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 145 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
137}
138 146
139static inline int has_pushable_tasks(struct rq *rq) 147 /* Update the new highest prio pushable task */
140{ 148 if (has_pushable_tasks(rq)) {
141 return !plist_head_empty(&rq->rt.pushable_tasks); 149 p = plist_first_entry(&rq->rt.pushable_tasks,
150 struct task_struct, pushable_tasks);
151 rq->rt.highest_prio.next = p->prio;
152 } else
153 rq->rt.highest_prio.next = MAX_RT_PRIO;
142} 154}
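
The reworked enqueue/dequeue above maintain rq->rt.highest_prio.next as a cheap cache over the priority-sorted pushable list: enqueue only ever lowers it, and dequeue re-reads the new list head (or resets to MAX_RT_PRIO when the list empties). That is what allows the next_prio() rescan to be removed from inc/dec_rt_prio_smp() further down. A minimal model using a sorted array in place of the plist:

#include <stdio.h>

#define MAX_RT_PRIO 100

/* priorities of pushable tasks, kept sorted ascending (lower = more urgent) */
static int pushable[8];
static int nr_pushable;
static int highest_prio_next = MAX_RT_PRIO;

static void enqueue_pushable(int prio)
{
        int i = nr_pushable++;

        while (i > 0 && pushable[i - 1] > prio) {       /* insertion-sort step */
                pushable[i] = pushable[i - 1];
                i--;
        }
        pushable[i] = prio;
        if (prio < highest_prio_next)   /* cache can only become more urgent */
                highest_prio_next = prio;
}

static void dequeue_pushable(int idx)
{
        for (int i = idx; i < nr_pushable - 1; i++)
                pushable[i] = pushable[i + 1];
        nr_pushable--;
        /* recompute the cache from the new head, as dequeue_pushable_task() does */
        highest_prio_next = nr_pushable ? pushable[0] : MAX_RT_PRIO;
}

int main(void)
{
        enqueue_pushable(30);
        enqueue_pushable(10);
        printf("next: %d\n", highest_prio_next);        /* 10            */
        dequeue_pushable(0);
        printf("next: %d\n", highest_prio_next);        /* 30            */
        dequeue_pushable(0);
        printf("next: %d\n", highest_prio_next);        /* 100 (no task) */
        return 0;
}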
143 155
144#else 156#else
@@ -643,6 +655,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
643 655
644 if (rt_rq->rt_time > runtime) { 656 if (rt_rq->rt_time > runtime) {
645 rt_rq->rt_throttled = 1; 657 rt_rq->rt_throttled = 1;
658 printk_once(KERN_WARNING "sched: RT throttling activated\n");
646 if (rt_rq_throttled(rt_rq)) { 659 if (rt_rq_throttled(rt_rq)) {
647 sched_rt_rq_dequeue(rt_rq); 660 sched_rt_rq_dequeue(rt_rq);
648 return 1; 661 return 1;
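
The new one-shot warning fires the first time an rt_rq burns through its runtime budget for the current period; with the default sysctls (sched_rt_period_us = 1000000 and sched_rt_runtime_us = 950000, assumed here), that means realtime tasks tried to take more than 95% of a CPU over a one-second window. The ratio, spelled out:

#include <stdio.h>

int main(void)
{
        unsigned int rt_period_us  = 1000000;   /* assumed default sched_rt_period_us  */
        unsigned int rt_runtime_us = 950000;    /* assumed default sched_rt_runtime_us */

        printf("rt tasks may use %.0f%% of each cpu before throttling\n",
               100.0 * rt_runtime_us / rt_period_us);   /* 95%% */
        return 0;
}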
@@ -698,47 +711,13 @@ static void update_curr_rt(struct rq *rq)
698 711
699#if defined CONFIG_SMP 712#if defined CONFIG_SMP
700 713
701static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
702
703static inline int next_prio(struct rq *rq)
704{
705 struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
706
707 if (next && rt_prio(next->prio))
708 return next->prio;
709 else
710 return MAX_RT_PRIO;
711}
712
713static void 714static void
714inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 715inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
715{ 716{
716 struct rq *rq = rq_of_rt_rq(rt_rq); 717 struct rq *rq = rq_of_rt_rq(rt_rq);
717 718
718 if (prio < prev_prio) { 719 if (rq->online && prio < prev_prio)
719 720 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
720 /*
721 * If the new task is higher in priority than anything on the
722 * run-queue, we know that the previous high becomes our
723 * next-highest.
724 */
725 rt_rq->highest_prio.next = prev_prio;
726
727 if (rq->online)
728 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
729
730 } else if (prio == rt_rq->highest_prio.curr)
731 /*
732 * If the next task is equal in priority to the highest on
733 * the run-queue, then we implicitly know that the next highest
734 * task cannot be any lower than current
735 */
736 rt_rq->highest_prio.next = prio;
737 else if (prio < rt_rq->highest_prio.next)
738 /*
739 * Otherwise, we need to recompute next-highest
740 */
741 rt_rq->highest_prio.next = next_prio(rq);
742} 721}
743 722
744static void 723static void
@@ -746,9 +725,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
746{ 725{
747 struct rq *rq = rq_of_rt_rq(rt_rq); 726 struct rq *rq = rq_of_rt_rq(rt_rq);
748 727
749 if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
750 rt_rq->highest_prio.next = next_prio(rq);
751
752 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 728 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
753 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 729 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
754} 730}
@@ -961,6 +937,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
961 937
962 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 938 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
963 enqueue_pushable_task(rq, p); 939 enqueue_pushable_task(rq, p);
940
941 inc_nr_running(rq);
964} 942}
965 943
966static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 944static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -971,6 +949,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
971 dequeue_rt_entity(rt_se); 949 dequeue_rt_entity(rt_se);
972 950
973 dequeue_pushable_task(rq, p); 951 dequeue_pushable_task(rq, p);
952
953 dec_nr_running(rq);
974} 954}
975 955
976/* 956/*
@@ -1017,10 +997,12 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1017 struct rq *rq; 997 struct rq *rq;
1018 int cpu; 998 int cpu;
1019 999
1020 if (sd_flag != SD_BALANCE_WAKE)
1021 return smp_processor_id();
1022
1023 cpu = task_cpu(p); 1000 cpu = task_cpu(p);
1001
1002 /* For anything but wake ups, just return the task_cpu */
1003 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1004 goto out;
1005
1024 rq = cpu_rq(cpu); 1006 rq = cpu_rq(cpu);
1025 1007
1026 rcu_read_lock(); 1008 rcu_read_lock();
@@ -1059,6 +1041,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1059 } 1041 }
1060 rcu_read_unlock(); 1042 rcu_read_unlock();
1061 1043
1044out:
1062 return cpu; 1045 return cpu;
1063} 1046}
1064 1047
@@ -1178,7 +1161,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1178static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 1161static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1179{ 1162{
1180 update_curr_rt(rq); 1163 update_curr_rt(rq);
1181 p->se.exec_start = 0;
1182 1164
1183 /* 1165 /*
1184 * The previous task needs to be made eligible for pushing 1166 * The previous task needs to be made eligible for pushing
@@ -1198,7 +1180,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1198static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1180static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1199{ 1181{
1200 if (!task_running(rq, p) && 1182 if (!task_running(rq, p) &&
1201 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && 1183 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1202 (p->rt.nr_cpus_allowed > 1)) 1184 (p->rt.nr_cpus_allowed > 1))
1203 return 1; 1185 return 1;
1204 return 0; 1186 return 0;
@@ -1343,7 +1325,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1343 */ 1325 */
1344 if (unlikely(task_rq(task) != rq || 1326 if (unlikely(task_rq(task) != rq ||
1345 !cpumask_test_cpu(lowest_rq->cpu, 1327 !cpumask_test_cpu(lowest_rq->cpu,
1346 &task->cpus_allowed) || 1328 tsk_cpus_allowed(task)) ||
1347 task_running(rq, task) || 1329 task_running(rq, task) ||
1348 !task->on_rq)) { 1330 !task->on_rq)) {
1349 1331
@@ -1394,6 +1376,7 @@ static int push_rt_task(struct rq *rq)
1394{ 1376{
1395 struct task_struct *next_task; 1377 struct task_struct *next_task;
1396 struct rq *lowest_rq; 1378 struct rq *lowest_rq;
1379 int ret = 0;
1397 1380
1398 if (!rq->rt.overloaded) 1381 if (!rq->rt.overloaded)
1399 return 0; 1382 return 0;
@@ -1426,7 +1409,7 @@ retry:
1426 if (!lowest_rq) { 1409 if (!lowest_rq) {
1427 struct task_struct *task; 1410 struct task_struct *task;
1428 /* 1411 /*
1429 * find lock_lowest_rq releases rq->lock 1412 * find_lock_lowest_rq releases rq->lock
1430 * so it is possible that next_task has migrated. 1413 * so it is possible that next_task has migrated.
1431 * 1414 *
1432 * We need to make sure that the task is still on the same 1415 * We need to make sure that the task is still on the same
@@ -1436,12 +1419,11 @@ retry:
1436 task = pick_next_pushable_task(rq); 1419 task = pick_next_pushable_task(rq);
1437 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1420 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1438 /* 1421 /*
1439 * If we get here, the task hasn't moved at all, but 1422 * The task hasn't migrated, and is still the next
1440 * it has failed to push. We will not try again, 1423 * eligible task, but we failed to find a run-queue
1441 * since the other cpus will pull from us when they 1424 * to push it to. Do not retry in this case, since
1442 * are ready. 1425 * other cpus will pull from us when ready.
1443 */ 1426 */
1444 dequeue_pushable_task(rq, next_task);
1445 goto out; 1427 goto out;
1446 } 1428 }
1447 1429
@@ -1460,6 +1442,7 @@ retry:
1460 deactivate_task(rq, next_task, 0); 1442 deactivate_task(rq, next_task, 0);
1461 set_task_cpu(next_task, lowest_rq->cpu); 1443 set_task_cpu(next_task, lowest_rq->cpu);
1462 activate_task(lowest_rq, next_task, 0); 1444 activate_task(lowest_rq, next_task, 0);
1445 ret = 1;
1463 1446
1464 resched_task(lowest_rq->curr); 1447 resched_task(lowest_rq->curr);
1465 1448
@@ -1468,7 +1451,7 @@ retry:
1468out: 1451out:
1469 put_task_struct(next_task); 1452 put_task_struct(next_task);
1470 1453
1471 return 1; 1454 return ret;
1472} 1455}
1473 1456
1474static void push_rt_tasks(struct rq *rq) 1457static void push_rt_tasks(struct rq *rq)
@@ -1626,9 +1609,6 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1626 1609
1627 update_rt_migration(&rq->rt); 1610 update_rt_migration(&rq->rt);
1628 } 1611 }
1629
1630 cpumask_copy(&p->cpus_allowed, new_mask);
1631 p->rt.nr_cpus_allowed = weight;
1632} 1612}
1633 1613
1634/* Assumes rq->lock is held */ 1614/* Assumes rq->lock is held */
@@ -1863,4 +1843,3 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1863 rcu_read_unlock(); 1843 rcu_read_unlock();
1864} 1844}
1865#endif /* CONFIG_SCHED_DEBUG */ 1845#endif /* CONFIG_SCHED_DEBUG */
1866
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 6f437632afab..8b44e7fa7fb3 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
34static void 34static void
35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 inc_nr_running(rq);
37} 38}
38 39
39static void 40static void
40dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 41dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
41{ 42{
43 dec_nr_running(rq);
42} 44}
43 45
44static void yield_task_stop(struct rq *rq) 46static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b531e50..2d2ecdcc8cdb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = {
379 .extra2 = &one, 379 .extra2 = &one,
380 }, 380 },
381#endif 381#endif
382#ifdef CONFIG_CFS_BANDWIDTH
383 {
384 .procname = "sched_cfs_bandwidth_slice_us",
385 .data = &sysctl_sched_cfs_bandwidth_slice,
386 .maxlen = sizeof(unsigned int),
387 .mode = 0644,
388 .proc_handler = proc_dointvec_minmax,
389 .extra1 = &one,
390 },
391#endif
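
This entry shows up as /proc/sys/kernel/sched_cfs_bandwidth_slice_us and only tunes how much runtime a cfs_rq pulls from the pool at a time; the per-group quota and period themselves are written through the cpu cgroup's cpu.cfs_quota_us and cpu.cfs_period_us files. A small sketch that caps a group at half a CPU, assuming a cgroup-v1 cpu hierarchy mounted at /sys/fs/cgroup/cpu and an already-created group named "demo":

#include <stdio.h>

/* write a single integer to a sysfs/cgroup file; needs sufficient privileges */
static int write_val(const char *path, long val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return -1;
        }
        fprintf(f, "%ld\n", val);
        return fclose(f);
}

int main(void)
{
        /* paths are assumptions: cgroup-v1 cpu controller, group "demo" exists */
        write_val("/sys/fs/cgroup/cpu/demo/cpu.cfs_period_us", 100000); /* 100 ms */
        write_val("/sys/fs/cgroup/cpu/demo/cpu.cfs_quota_us",   50000); /*  50 ms */
        return 0;
}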
382#ifdef CONFIG_PROVE_LOCKING 392#ifdef CONFIG_PROVE_LOCKING
383 { 393 {
384 .procname = "prove_locking", 394 .procname = "prove_locking",