Diffstat (limited to 'kernel/sched.c')
-rw-r--r--    kernel/sched.c    666
1 file changed, 549 insertions, 117 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 03ad0113801a..d87c6e5d4e8c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void)
196 return sysctl_sched_rt_runtime >= 0; 196 return sysctl_sched_rt_runtime >= 0;
197} 197}
198 198
199static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 199static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
200{ 200{
201 ktime_t now; 201 unsigned long delta;
202 ktime_t soft, hard, now;
203
204 for (;;) {
205 if (hrtimer_active(period_timer))
206 break;
207
208 now = hrtimer_cb_get_time(period_timer);
209 hrtimer_forward(period_timer, now, period);
202 210
211 soft = hrtimer_get_softexpires(period_timer);
212 hard = hrtimer_get_expires(period_timer);
213 delta = ktime_to_ns(ktime_sub(hard, soft));
214 __hrtimer_start_range_ns(period_timer, soft, delta,
215 HRTIMER_MODE_ABS_PINNED, 0);
216 }
217}
218
219static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
220{
203 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 221 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
204 return; 222 return;
205 223
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
207 return; 225 return;
208 226
209 raw_spin_lock(&rt_b->rt_runtime_lock); 227 raw_spin_lock(&rt_b->rt_runtime_lock);
210 for (;;) { 228 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
211 unsigned long delta;
212 ktime_t soft, hard;
213
214 if (hrtimer_active(&rt_b->rt_period_timer))
215 break;
216
217 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
218 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
219
220 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
221 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
222 delta = ktime_to_ns(ktime_sub(hard, soft));
223 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
224 HRTIMER_MODE_ABS_PINNED, 0);
225 }
226 raw_spin_unlock(&rt_b->rt_runtime_lock); 229 raw_spin_unlock(&rt_b->rt_runtime_lock);
227} 230}
228 231
@@ -247,6 +250,24 @@ struct cfs_rq;
247 250
248static LIST_HEAD(task_groups); 251static LIST_HEAD(task_groups);
249 252
253struct cfs_bandwidth {
254#ifdef CONFIG_CFS_BANDWIDTH
255 raw_spinlock_t lock;
256 ktime_t period;
257 u64 quota, runtime;
258 s64 hierarchal_quota;
259 u64 runtime_expires;
260
261 int idle, timer_active;
262 struct hrtimer period_timer, slack_timer;
263 struct list_head throttled_cfs_rq;
264
265 /* statistics */
266 int nr_periods, nr_throttled;
267 u64 throttled_time;
268#endif
269};
270
250/* task group related information */ 271/* task group related information */
251struct task_group { 272struct task_group {
252 struct cgroup_subsys_state css; 273 struct cgroup_subsys_state css;
@@ -278,6 +299,8 @@ struct task_group {
278#ifdef CONFIG_SCHED_AUTOGROUP 299#ifdef CONFIG_SCHED_AUTOGROUP
279 struct autogroup *autogroup; 300 struct autogroup *autogroup;
280#endif 301#endif
302
303 struct cfs_bandwidth cfs_bandwidth;
281}; 304};
282 305
283/* task_group_lock serializes the addition/removal of task groups */ 306/* task_group_lock serializes the addition/removal of task groups */
@@ -311,7 +334,7 @@ struct task_group root_task_group;
311/* CFS-related fields in a runqueue */ 334/* CFS-related fields in a runqueue */
312struct cfs_rq { 335struct cfs_rq {
313 struct load_weight load; 336 struct load_weight load;
314 unsigned long nr_running; 337 unsigned long nr_running, h_nr_running;
315 338
316 u64 exec_clock; 339 u64 exec_clock;
317 u64 min_vruntime; 340 u64 min_vruntime;
@@ -377,9 +400,120 @@ struct cfs_rq {
377 400
378 unsigned long load_contribution; 401 unsigned long load_contribution;
379#endif 402#endif
403#ifdef CONFIG_CFS_BANDWIDTH
404 int runtime_enabled;
405 u64 runtime_expires;
406 s64 runtime_remaining;
407
408 u64 throttled_timestamp;
409 int throttled, throttle_count;
410 struct list_head throttled_list;
411#endif
380#endif 412#endif
381}; 413};
382 414
415#ifdef CONFIG_FAIR_GROUP_SCHED
416#ifdef CONFIG_CFS_BANDWIDTH
417static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
418{
419 return &tg->cfs_bandwidth;
420}
421
422static inline u64 default_cfs_period(void);
423static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
424static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
425
426static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
427{
428 struct cfs_bandwidth *cfs_b =
429 container_of(timer, struct cfs_bandwidth, slack_timer);
430 do_sched_cfs_slack_timer(cfs_b);
431
432 return HRTIMER_NORESTART;
433}
434
435static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
436{
437 struct cfs_bandwidth *cfs_b =
438 container_of(timer, struct cfs_bandwidth, period_timer);
439 ktime_t now;
440 int overrun;
441 int idle = 0;
442
443 for (;;) {
444 now = hrtimer_cb_get_time(timer);
445 overrun = hrtimer_forward(timer, now, cfs_b->period);
446
447 if (!overrun)
448 break;
449
450 idle = do_sched_cfs_period_timer(cfs_b, overrun);
451 }
452
453 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
454}
455
456static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
457{
458 raw_spin_lock_init(&cfs_b->lock);
459 cfs_b->runtime = 0;
460 cfs_b->quota = RUNTIME_INF;
461 cfs_b->period = ns_to_ktime(default_cfs_period());
462
463 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
464 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
465 cfs_b->period_timer.function = sched_cfs_period_timer;
466 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
467 cfs_b->slack_timer.function = sched_cfs_slack_timer;
468}
469
470static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
471{
472 cfs_rq->runtime_enabled = 0;
473 INIT_LIST_HEAD(&cfs_rq->throttled_list);
474}
475
476/* requires cfs_b->lock, may release to reprogram timer */
477static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
478{
479 /*
480 * The timer may be active because we're trying to set a new bandwidth
481 * period or because we're racing with the tear-down path
482 * (timer_active==0 becomes visible before the hrtimer call-back
483 * terminates). In either case we ensure that it's re-programmed
484 */
485 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
486 raw_spin_unlock(&cfs_b->lock);
487 /* ensure cfs_b->lock is available while we wait */
488 hrtimer_cancel(&cfs_b->period_timer);
489
490 raw_spin_lock(&cfs_b->lock);
491 /* if someone else restarted the timer then we're done */
492 if (cfs_b->timer_active)
493 return;
494 }
495
496 cfs_b->timer_active = 1;
497 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
498}
499
500static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
501{
502 hrtimer_cancel(&cfs_b->period_timer);
503 hrtimer_cancel(&cfs_b->slack_timer);
504}
505#else
506static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
507static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
508static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509
510static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
511{
512 return NULL;
513}
514#endif /* CONFIG_CFS_BANDWIDTH */
515#endif /* CONFIG_FAIR_GROUP_SCHED */
516
383/* Real-Time classes' related field in a runqueue: */ 517/* Real-Time classes' related field in a runqueue: */
384struct rt_rq { 518struct rt_rq {
385 struct rt_prio_array active; 519 struct rt_prio_array active;
@@ -510,7 +644,7 @@ struct rq {
510 644
511 unsigned long cpu_power; 645 unsigned long cpu_power;
512 646
513 unsigned char idle_at_tick; 647 unsigned char idle_balance;
514 /* For active balancing */ 648 /* For active balancing */
515 int post_schedule; 649 int post_schedule;
516 int active_balance; 650 int active_balance;
@@ -520,8 +654,6 @@ struct rq {
520 int cpu; 654 int cpu;
521 int online; 655 int online;
522 656
523 unsigned long avg_load_per_task;
524
525 u64 rt_avg; 657 u64 rt_avg;
526 u64 age_stamp; 658 u64 age_stamp;
527 u64 idle_stamp; 659 u64 idle_stamp;
@@ -570,7 +702,7 @@ struct rq {
570#endif 702#endif
571 703
572#ifdef CONFIG_SMP 704#ifdef CONFIG_SMP
573 struct task_struct *wake_list; 705 struct llist_head wake_list;
574#endif 706#endif
575}; 707};
576 708
@@ -1272,6 +1404,18 @@ void wake_up_idle_cpu(int cpu)
1272 smp_send_reschedule(cpu); 1404 smp_send_reschedule(cpu);
1273} 1405}
1274 1406
1407static inline bool got_nohz_idle_kick(void)
1408{
1409 return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
1410}
1411
1412#else /* CONFIG_NO_HZ */
1413
1414static inline bool got_nohz_idle_kick(void)
1415{
1416 return false;
1417}
1418
1275#endif /* CONFIG_NO_HZ */ 1419#endif /* CONFIG_NO_HZ */
1276 1420
1277static u64 sched_avg_period(void) 1421static u64 sched_avg_period(void)
@@ -1471,24 +1615,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1471 update_load_sub(&rq->load, load); 1615 update_load_sub(&rq->load, load);
1472} 1616}
1473 1617
1474#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1618#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1619 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1475typedef int (*tg_visitor)(struct task_group *, void *); 1620typedef int (*tg_visitor)(struct task_group *, void *);
1476 1621
1477/* 1622/*
1478 * Iterate the full tree, calling @down when first entering a node and @up when 1623 * Iterate task_group tree rooted at *from, calling @down when first entering a
1479 * leaving it for the final time. 1624 * node and @up when leaving it for the final time.
1625 *
1626 * Caller must hold rcu_lock or sufficient equivalent.
1480 */ 1627 */
1481static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1628static int walk_tg_tree_from(struct task_group *from,
1629 tg_visitor down, tg_visitor up, void *data)
1482{ 1630{
1483 struct task_group *parent, *child; 1631 struct task_group *parent, *child;
1484 int ret; 1632 int ret;
1485 1633
1486 rcu_read_lock(); 1634 parent = from;
1487 parent = &root_task_group; 1635
1488down: 1636down:
1489 ret = (*down)(parent, data); 1637 ret = (*down)(parent, data);
1490 if (ret) 1638 if (ret)
1491 goto out_unlock; 1639 goto out;
1492 list_for_each_entry_rcu(child, &parent->children, siblings) { 1640 list_for_each_entry_rcu(child, &parent->children, siblings) {
1493 parent = child; 1641 parent = child;
1494 goto down; 1642 goto down;
@@ -1497,19 +1645,29 @@ up:
1497 continue; 1645 continue;
1498 } 1646 }
1499 ret = (*up)(parent, data); 1647 ret = (*up)(parent, data);
1500 if (ret) 1648 if (ret || parent == from)
1501 goto out_unlock; 1649 goto out;
1502 1650
1503 child = parent; 1651 child = parent;
1504 parent = parent->parent; 1652 parent = parent->parent;
1505 if (parent) 1653 if (parent)
1506 goto up; 1654 goto up;
1507out_unlock: 1655out:
1508 rcu_read_unlock();
1509
1510 return ret; 1656 return ret;
1511} 1657}
1512 1658
1659/*
1660 * Iterate the full tree, calling @down when first entering a node and @up when
1661 * leaving it for the final time.
1662 *
1663 * Caller must hold rcu_lock or sufficient equivalent.
1664 */
1665
1666static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1667{
1668 return walk_tg_tree_from(&root_task_group, down, up, data);
1669}
1670
1513static int tg_nop(struct task_group *tg, void *data) 1671static int tg_nop(struct task_group *tg, void *data)
1514{ 1672{
1515 return 0; 1673 return 0;
@@ -1569,11 +1727,9 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1569 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1727 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1570 1728
1571 if (nr_running) 1729 if (nr_running)
1572 rq->avg_load_per_task = rq->load.weight / nr_running; 1730 return rq->load.weight / nr_running;
1573 else
1574 rq->avg_load_per_task = 0;
1575 1731
1576 return rq->avg_load_per_task; 1732 return 0;
1577} 1733}
1578 1734
1579#ifdef CONFIG_PREEMPT 1735#ifdef CONFIG_PREEMPT
@@ -1806,7 +1962,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1806 rq->nr_uninterruptible--; 1962 rq->nr_uninterruptible--;
1807 1963
1808 enqueue_task(rq, p, flags); 1964 enqueue_task(rq, p, flags);
1809 inc_nr_running(rq);
1810} 1965}
1811 1966
1812/* 1967/*
@@ -1818,7 +1973,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1818 rq->nr_uninterruptible++; 1973 rq->nr_uninterruptible++;
1819 1974
1820 dequeue_task(rq, p, flags); 1975 dequeue_task(rq, p, flags);
1821 dec_nr_running(rq);
1822} 1976}
1823 1977
1824#ifdef CONFIG_IRQ_TIME_ACCOUNTING 1978#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2390,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2390 2544
2391 /* Look for allowed, online CPU in same node. */ 2545 /* Look for allowed, online CPU in same node. */
2392 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 2546 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2393 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 2547 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
2394 return dest_cpu; 2548 return dest_cpu;
2395 2549
2396 /* Any allowed, online CPU? */ 2550 /* Any allowed, online CPU? */
2397 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 2551 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
2398 if (dest_cpu < nr_cpu_ids) 2552 if (dest_cpu < nr_cpu_ids)
2399 return dest_cpu; 2553 return dest_cpu;
2400 2554
@@ -2431,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2431 * [ this allows ->select_task() to simply return task_cpu(p) and 2585 * [ this allows ->select_task() to simply return task_cpu(p) and
2432 * not worry about this generic constraint ] 2586 * not worry about this generic constraint ]
2433 */ 2587 */
2434 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 2588 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
2435 !cpu_online(cpu))) 2589 !cpu_online(cpu)))
2436 cpu = select_fallback_rq(task_cpu(p), p); 2590 cpu = select_fallback_rq(task_cpu(p), p);
2437 2591
@@ -2556,42 +2710,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
2556} 2710}
2557 2711
2558#ifdef CONFIG_SMP 2712#ifdef CONFIG_SMP
2559static void sched_ttwu_do_pending(struct task_struct *list) 2713static void sched_ttwu_pending(void)
2560{ 2714{
2561 struct rq *rq = this_rq(); 2715 struct rq *rq = this_rq();
2716 struct llist_node *llist = llist_del_all(&rq->wake_list);
2717 struct task_struct *p;
2562 2718
2563 raw_spin_lock(&rq->lock); 2719 raw_spin_lock(&rq->lock);
2564 2720
2565 while (list) { 2721 while (llist) {
2566 struct task_struct *p = list; 2722 p = llist_entry(llist, struct task_struct, wake_entry);
2567 list = list->wake_entry; 2723 llist = llist_next(llist);
2568 ttwu_do_activate(rq, p, 0); 2724 ttwu_do_activate(rq, p, 0);
2569 } 2725 }
2570 2726
2571 raw_spin_unlock(&rq->lock); 2727 raw_spin_unlock(&rq->lock);
2572} 2728}
2573 2729
2574#ifdef CONFIG_HOTPLUG_CPU
2575
2576static void sched_ttwu_pending(void)
2577{
2578 struct rq *rq = this_rq();
2579 struct task_struct *list = xchg(&rq->wake_list, NULL);
2580
2581 if (!list)
2582 return;
2583
2584 sched_ttwu_do_pending(list);
2585}
2586
2587#endif /* CONFIG_HOTPLUG_CPU */
2588
2589void scheduler_ipi(void) 2730void scheduler_ipi(void)
2590{ 2731{
2591 struct rq *rq = this_rq(); 2732 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2592 struct task_struct *list = xchg(&rq->wake_list, NULL);
2593
2594 if (!list)
2595 return; 2733 return;
2596 2734
2597 /* 2735 /*
@@ -2608,25 +2746,21 @@ void scheduler_ipi(void)
2608 * somewhat pessimize the simple resched case. 2746 * somewhat pessimize the simple resched case.
2609 */ 2747 */
2610 irq_enter(); 2748 irq_enter();
2611 sched_ttwu_do_pending(list); 2749 sched_ttwu_pending();
2750
2751 /*
2752 * Check if someone kicked us for doing the nohz idle load balance.
2753 */
2754 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
2755 this_rq()->idle_balance = 1;
2756 raise_softirq_irqoff(SCHED_SOFTIRQ);
2757 }
2612 irq_exit(); 2758 irq_exit();
2613} 2759}
2614 2760
2615static void ttwu_queue_remote(struct task_struct *p, int cpu) 2761static void ttwu_queue_remote(struct task_struct *p, int cpu)
2616{ 2762{
2617 struct rq *rq = cpu_rq(cpu); 2763 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
2618 struct task_struct *next = rq->wake_list;
2619
2620 for (;;) {
2621 struct task_struct *old = next;
2622
2623 p->wake_entry = next;
2624 next = cmpxchg(&rq->wake_list, old, p);
2625 if (next == old)
2626 break;
2627 }
2628
2629 if (!next)
2630 smp_send_reschedule(cpu); 2764 smp_send_reschedule(cpu);
2631} 2765}
2632 2766
@@ -2848,19 +2982,23 @@ void sched_fork(struct task_struct *p)
2848 p->state = TASK_RUNNING; 2982 p->state = TASK_RUNNING;
2849 2983
2850 /* 2984 /*
2985 * Make sure we do not leak PI boosting priority to the child.
2986 */
2987 p->prio = current->normal_prio;
2988
2989 /*
2851 * Revert to default priority/policy on fork if requested. 2990 * Revert to default priority/policy on fork if requested.
2852 */ 2991 */
2853 if (unlikely(p->sched_reset_on_fork)) { 2992 if (unlikely(p->sched_reset_on_fork)) {
2854 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2993 if (task_has_rt_policy(p)) {
2855 p->policy = SCHED_NORMAL; 2994 p->policy = SCHED_NORMAL;
2856 p->normal_prio = p->static_prio;
2857 }
2858
2859 if (PRIO_TO_NICE(p->static_prio) < 0) {
2860 p->static_prio = NICE_TO_PRIO(0); 2995 p->static_prio = NICE_TO_PRIO(0);
2861 p->normal_prio = p->static_prio; 2996 p->rt_priority = 0;
2862 set_load_weight(p); 2997 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2863 } 2998 p->static_prio = NICE_TO_PRIO(0);
2999
3000 p->prio = p->normal_prio = __normal_prio(p);
3001 set_load_weight(p);
2864 3002
2865 /* 3003 /*
2866 * We don't need the reset flag anymore after the fork. It has 3004 * We don't need the reset flag anymore after the fork. It has
@@ -2869,11 +3007,6 @@ void sched_fork(struct task_struct *p)
2869 p->sched_reset_on_fork = 0; 3007 p->sched_reset_on_fork = 0;
2870 } 3008 }
2871 3009
2872 /*
2873 * Make sure we do not leak PI boosting priority to the child.
2874 */
2875 p->prio = current->normal_prio;
2876
2877 if (!rt_prio(p->prio)) 3010 if (!rt_prio(p->prio))
2878 p->sched_class = &fair_sched_class; 3011 p->sched_class = &fair_sched_class;
2879 3012
@@ -4116,7 +4249,7 @@ void scheduler_tick(void)
4116 perf_event_task_tick(); 4249 perf_event_task_tick();
4117 4250
4118#ifdef CONFIG_SMP 4251#ifdef CONFIG_SMP
4119 rq->idle_at_tick = idle_cpu(cpu); 4252 rq->idle_balance = idle_cpu(cpu);
4120 trigger_load_balance(rq, cpu); 4253 trigger_load_balance(rq, cpu);
4121#endif 4254#endif
4122} 4255}
@@ -4240,7 +4373,7 @@ pick_next_task(struct rq *rq)
4240 * Optimization: we know that if all tasks are in 4373 * Optimization: we know that if all tasks are in
4241 * the fair class we can call that function directly: 4374 * the fair class we can call that function directly:
4242 */ 4375 */
4243 if (likely(rq->nr_running == rq->cfs.nr_running)) { 4376 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
4244 p = fair_sched_class.pick_next_task(rq); 4377 p = fair_sched_class.pick_next_task(rq);
4245 if (likely(p)) 4378 if (likely(p))
4246 return p; 4379 return p;
@@ -5026,7 +5159,20 @@ EXPORT_SYMBOL(task_nice);
5026 */ 5159 */
5027int idle_cpu(int cpu) 5160int idle_cpu(int cpu)
5028{ 5161{
5029 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 5162 struct rq *rq = cpu_rq(cpu);
5163
5164 if (rq->curr != rq->idle)
5165 return 0;
5166
5167 if (rq->nr_running)
5168 return 0;
5169
5170#ifdef CONFIG_SMP
5171 if (!llist_empty(&rq->wake_list))
5172 return 0;
5173#endif
5174
5175 return 1;
5030} 5176}
5031 5177
5032/** 5178/**
@@ -5876,7 +6022,7 @@ void show_state_filter(unsigned long state_filter)
5876 printk(KERN_INFO 6022 printk(KERN_INFO
5877 " task PC stack pid father\n"); 6023 " task PC stack pid father\n");
5878#endif 6024#endif
5879 read_lock(&tasklist_lock); 6025 rcu_read_lock();
5880 do_each_thread(g, p) { 6026 do_each_thread(g, p) {
5881 /* 6027 /*
5882 * reset the NMI-timeout, listing all files on a slow 6028 * reset the NMI-timeout, listing all files on a slow
@@ -5892,7 +6038,7 @@ void show_state_filter(unsigned long state_filter)
5892#ifdef CONFIG_SCHED_DEBUG 6038#ifdef CONFIG_SCHED_DEBUG
5893 sysrq_sched_debug_show(); 6039 sysrq_sched_debug_show();
5894#endif 6040#endif
5895 read_unlock(&tasklist_lock); 6041 rcu_read_unlock();
5896 /* 6042 /*
5897 * Only show locks if all tasks are dumped: 6043 * Only show locks if all tasks are dumped:
5898 */ 6044 */
@@ -6007,10 +6153,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6007{ 6153{
6008 if (p->sched_class && p->sched_class->set_cpus_allowed) 6154 if (p->sched_class && p->sched_class->set_cpus_allowed)
6009 p->sched_class->set_cpus_allowed(p, new_mask); 6155 p->sched_class->set_cpus_allowed(p, new_mask);
6010 else { 6156
6011 cpumask_copy(&p->cpus_allowed, new_mask); 6157 cpumask_copy(&p->cpus_allowed, new_mask);
6012 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 6158 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6013 }
6014} 6159}
6015 6160
6016/* 6161/*
@@ -6108,7 +6253,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6108 if (task_cpu(p) != src_cpu) 6253 if (task_cpu(p) != src_cpu)
6109 goto done; 6254 goto done;
6110 /* Affinity changed (again). */ 6255 /* Affinity changed (again). */
6111 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 6256 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
6112 goto fail; 6257 goto fail;
6113 6258
6114 /* 6259 /*
@@ -6189,6 +6334,30 @@ static void calc_global_load_remove(struct rq *rq)
6189 rq->calc_load_active = 0; 6334 rq->calc_load_active = 0;
6190} 6335}
6191 6336
6337#ifdef CONFIG_CFS_BANDWIDTH
6338static void unthrottle_offline_cfs_rqs(struct rq *rq)
6339{
6340 struct cfs_rq *cfs_rq;
6341
6342 for_each_leaf_cfs_rq(rq, cfs_rq) {
6343 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6344
6345 if (!cfs_rq->runtime_enabled)
6346 continue;
6347
6348 /*
6349 * clock_task is not advancing so we just need to make sure
6350 * there's some valid quota amount
6351 */
6352 cfs_rq->runtime_remaining = cfs_b->quota;
6353 if (cfs_rq_throttled(cfs_rq))
6354 unthrottle_cfs_rq(cfs_rq);
6355 }
6356}
6357#else
6358static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6359#endif
6360
6192/* 6361/*
6193 * Migrate all tasks from the rq, sleeping tasks will be migrated by 6362 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6194 * try_to_wake_up()->select_task_rq(). 6363 * try_to_wake_up()->select_task_rq().
@@ -6214,6 +6383,9 @@ static void migrate_tasks(unsigned int dead_cpu)
6214 */ 6383 */
6215 rq->stop = NULL; 6384 rq->stop = NULL;
6216 6385
6386 /* Ensure any throttled groups are reachable by pick_next_task */
6387 unthrottle_offline_cfs_rqs(rq);
6388
6217 for ( ; ; ) { 6389 for ( ; ; ) {
6218 /* 6390 /*
6219 * There's this thread running, bail when that's the only 6391 * There's this thread running, bail when that's the only
@@ -7957,6 +8129,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7957 /* allow initial update_cfs_load() to truncate */ 8129 /* allow initial update_cfs_load() to truncate */
7958 cfs_rq->load_stamp = 1; 8130 cfs_rq->load_stamp = 1;
7959#endif 8131#endif
8132 init_cfs_rq_runtime(cfs_rq);
7960 8133
7961 tg->cfs_rq[cpu] = cfs_rq; 8134 tg->cfs_rq[cpu] = cfs_rq;
7962 tg->se[cpu] = se; 8135 tg->se[cpu] = se;
@@ -8096,6 +8269,7 @@ void __init sched_init(void)
8096 * We achieve this by letting root_task_group's tasks sit 8269 * We achieve this by letting root_task_group's tasks sit
8097 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 8270 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8098 */ 8271 */
8272 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
8099 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 8273 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8100#endif /* CONFIG_FAIR_GROUP_SCHED */ 8274#endif /* CONFIG_FAIR_GROUP_SCHED */
8101 8275
@@ -8125,7 +8299,6 @@ void __init sched_init(void)
8125 rq_attach_root(rq, &def_root_domain); 8299 rq_attach_root(rq, &def_root_domain);
8126#ifdef CONFIG_NO_HZ 8300#ifdef CONFIG_NO_HZ
8127 rq->nohz_balance_kick = 0; 8301 rq->nohz_balance_kick = 0;
8128 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
8129#endif 8302#endif
8130#endif 8303#endif
8131 init_rq_hrtick(rq); 8304 init_rq_hrtick(rq);
@@ -8336,6 +8509,8 @@ static void free_fair_sched_group(struct task_group *tg)
8336{ 8509{
8337 int i; 8510 int i;
8338 8511
8512 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8513
8339 for_each_possible_cpu(i) { 8514 for_each_possible_cpu(i) {
8340 if (tg->cfs_rq) 8515 if (tg->cfs_rq)
8341 kfree(tg->cfs_rq[i]); 8516 kfree(tg->cfs_rq[i]);
@@ -8363,6 +8538,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8363 8538
8364 tg->shares = NICE_0_LOAD; 8539 tg->shares = NICE_0_LOAD;
8365 8540
8541 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8542
8366 for_each_possible_cpu(i) { 8543 for_each_possible_cpu(i) {
8367 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8544 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8368 GFP_KERNEL, cpu_to_node(i)); 8545 GFP_KERNEL, cpu_to_node(i));
@@ -8638,12 +8815,7 @@ unsigned long sched_group_shares(struct task_group *tg)
8638} 8815}
8639#endif 8816#endif
8640 8817
8641#ifdef CONFIG_RT_GROUP_SCHED 8818#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
8642/*
8643 * Ensure that the real time constraints are schedulable.
8644 */
8645static DEFINE_MUTEX(rt_constraints_mutex);
8646
8647static unsigned long to_ratio(u64 period, u64 runtime) 8819static unsigned long to_ratio(u64 period, u64 runtime)
8648{ 8820{
8649 if (runtime == RUNTIME_INF) 8821 if (runtime == RUNTIME_INF)
@@ -8651,6 +8823,13 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8651 8823
8652 return div64_u64(runtime << 20, period); 8824 return div64_u64(runtime << 20, period);
8653} 8825}
8826#endif
8827
8828#ifdef CONFIG_RT_GROUP_SCHED
8829/*
8830 * Ensure that the real time constraints are schedulable.
8831 */
8832static DEFINE_MUTEX(rt_constraints_mutex);
8654 8833
8655/* Must be called with tasklist_lock held */ 8834/* Must be called with tasklist_lock held */
8656static inline int tg_has_rt_tasks(struct task_group *tg) 8835static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -8671,7 +8850,7 @@ struct rt_schedulable_data {
8671 u64 rt_runtime; 8850 u64 rt_runtime;
8672}; 8851};
8673 8852
8674static int tg_schedulable(struct task_group *tg, void *data) 8853static int tg_rt_schedulable(struct task_group *tg, void *data)
8675{ 8854{
8676 struct rt_schedulable_data *d = data; 8855 struct rt_schedulable_data *d = data;
8677 struct task_group *child; 8856 struct task_group *child;
@@ -8729,16 +8908,22 @@ static int tg_schedulable(struct task_group *tg, void *data)
8729 8908
8730static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8909static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8731{ 8910{
8911 int ret;
8912
8732 struct rt_schedulable_data data = { 8913 struct rt_schedulable_data data = {
8733 .tg = tg, 8914 .tg = tg,
8734 .rt_period = period, 8915 .rt_period = period,
8735 .rt_runtime = runtime, 8916 .rt_runtime = runtime,
8736 }; 8917 };
8737 8918
8738 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8919 rcu_read_lock();
8920 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
8921 rcu_read_unlock();
8922
8923 return ret;
8739} 8924}
8740 8925
8741static int tg_set_bandwidth(struct task_group *tg, 8926static int tg_set_rt_bandwidth(struct task_group *tg,
8742 u64 rt_period, u64 rt_runtime) 8927 u64 rt_period, u64 rt_runtime)
8743{ 8928{
8744 int i, err = 0; 8929 int i, err = 0;
@@ -8777,7 +8962,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8777 if (rt_runtime_us < 0) 8962 if (rt_runtime_us < 0)
8778 rt_runtime = RUNTIME_INF; 8963 rt_runtime = RUNTIME_INF;
8779 8964
8780 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8965 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8781} 8966}
8782 8967
8783long sched_group_rt_runtime(struct task_group *tg) 8968long sched_group_rt_runtime(struct task_group *tg)
@@ -8802,7 +8987,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8802 if (rt_period == 0) 8987 if (rt_period == 0)
8803 return -EINVAL; 8988 return -EINVAL;
8804 8989
8805 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8990 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8806} 8991}
8807 8992
8808long sched_group_rt_period(struct task_group *tg) 8993long sched_group_rt_period(struct task_group *tg)
@@ -8992,6 +9177,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8992 9177
8993 return (u64) scale_load_down(tg->shares); 9178 return (u64) scale_load_down(tg->shares);
8994} 9179}
9180
9181#ifdef CONFIG_CFS_BANDWIDTH
9182static DEFINE_MUTEX(cfs_constraints_mutex);
9183
9184const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
9185const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9186
9187static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9188
9189static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9190{
9191 int i, ret = 0, runtime_enabled;
9192 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9193
9194 if (tg == &root_task_group)
9195 return -EINVAL;
9196
9197 /*
9198 * Ensure we have at some amount of bandwidth every period. This is
9199 * to prevent reaching a state of large arrears when throttled via
9200 * entity_tick() resulting in prolonged exit starvation.
9201 */
9202 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
9203 return -EINVAL;
9204
9205 /*
9206 * Likewise, bound things on the otherside by preventing insane quota
9207 * periods. This also allows us to normalize in computing quota
9208 * feasibility.
9209 */
9210 if (period > max_cfs_quota_period)
9211 return -EINVAL;
9212
9213 mutex_lock(&cfs_constraints_mutex);
9214 ret = __cfs_schedulable(tg, period, quota);
9215 if (ret)
9216 goto out_unlock;
9217
9218 runtime_enabled = quota != RUNTIME_INF;
9219 raw_spin_lock_irq(&cfs_b->lock);
9220 cfs_b->period = ns_to_ktime(period);
9221 cfs_b->quota = quota;
9222
9223 __refill_cfs_bandwidth_runtime(cfs_b);
9224 /* restart the period timer (if active) to handle new period expiry */
9225 if (runtime_enabled && cfs_b->timer_active) {
9226 /* force a reprogram */
9227 cfs_b->timer_active = 0;
9228 __start_cfs_bandwidth(cfs_b);
9229 }
9230 raw_spin_unlock_irq(&cfs_b->lock);
9231
9232 for_each_possible_cpu(i) {
9233 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9234 struct rq *rq = rq_of(cfs_rq);
9235
9236 raw_spin_lock_irq(&rq->lock);
9237 cfs_rq->runtime_enabled = runtime_enabled;
9238 cfs_rq->runtime_remaining = 0;
9239
9240 if (cfs_rq_throttled(cfs_rq))
9241 unthrottle_cfs_rq(cfs_rq);
9242 raw_spin_unlock_irq(&rq->lock);
9243 }
9244out_unlock:
9245 mutex_unlock(&cfs_constraints_mutex);
9246
9247 return ret;
9248}
9249
9250int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9251{
9252 u64 quota, period;
9253
9254 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9255 if (cfs_quota_us < 0)
9256 quota = RUNTIME_INF;
9257 else
9258 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9259
9260 return tg_set_cfs_bandwidth(tg, period, quota);
9261}
9262
9263long tg_get_cfs_quota(struct task_group *tg)
9264{
9265 u64 quota_us;
9266
9267 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
9268 return -1;
9269
9270 quota_us = tg_cfs_bandwidth(tg)->quota;
9271 do_div(quota_us, NSEC_PER_USEC);
9272
9273 return quota_us;
9274}
9275
9276int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9277{
9278 u64 quota, period;
9279
9280 period = (u64)cfs_period_us * NSEC_PER_USEC;
9281 quota = tg_cfs_bandwidth(tg)->quota;
9282
9283 if (period <= 0)
9284 return -EINVAL;
9285
9286 return tg_set_cfs_bandwidth(tg, period, quota);
9287}
9288
9289long tg_get_cfs_period(struct task_group *tg)
9290{
9291 u64 cfs_period_us;
9292
9293 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9294 do_div(cfs_period_us, NSEC_PER_USEC);
9295
9296 return cfs_period_us;
9297}
9298
9299static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
9300{
9301 return tg_get_cfs_quota(cgroup_tg(cgrp));
9302}
9303
9304static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
9305 s64 cfs_quota_us)
9306{
9307 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
9308}
9309
9310static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
9311{
9312 return tg_get_cfs_period(cgroup_tg(cgrp));
9313}
9314
9315static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9316 u64 cfs_period_us)
9317{
9318 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
9319}
9320
9321struct cfs_schedulable_data {
9322 struct task_group *tg;
9323 u64 period, quota;
9324};
9325
9326/*
9327 * normalize group quota/period to be quota/max_period
9328 * note: units are usecs
9329 */
9330static u64 normalize_cfs_quota(struct task_group *tg,
9331 struct cfs_schedulable_data *d)
9332{
9333 u64 quota, period;
9334
9335 if (tg == d->tg) {
9336 period = d->period;
9337 quota = d->quota;
9338 } else {
9339 period = tg_get_cfs_period(tg);
9340 quota = tg_get_cfs_quota(tg);
9341 }
9342
9343 /* note: these should typically be equivalent */
9344 if (quota == RUNTIME_INF || quota == -1)
9345 return RUNTIME_INF;
9346
9347 return to_ratio(period, quota);
9348}
9349
9350static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9351{
9352 struct cfs_schedulable_data *d = data;
9353 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9354 s64 quota = 0, parent_quota = -1;
9355
9356 if (!tg->parent) {
9357 quota = RUNTIME_INF;
9358 } else {
9359 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
9360
9361 quota = normalize_cfs_quota(tg, d);
9362 parent_quota = parent_b->hierarchal_quota;
9363
9364 /*
9365 * ensure max(child_quota) <= parent_quota, inherit when no
9366 * limit is set
9367 */
9368 if (quota == RUNTIME_INF)
9369 quota = parent_quota;
9370 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
9371 return -EINVAL;
9372 }
9373 cfs_b->hierarchal_quota = quota;
9374
9375 return 0;
9376}
9377
9378static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9379{
9380 int ret;
9381 struct cfs_schedulable_data data = {
9382 .tg = tg,
9383 .period = period,
9384 .quota = quota,
9385 };
9386
9387 if (quota != RUNTIME_INF) {
9388 do_div(data.period, NSEC_PER_USEC);
9389 do_div(data.quota, NSEC_PER_USEC);
9390 }
9391
9392 rcu_read_lock();
9393 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
9394 rcu_read_unlock();
9395
9396 return ret;
9397}
9398
9399static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9400 struct cgroup_map_cb *cb)
9401{
9402 struct task_group *tg = cgroup_tg(cgrp);
9403 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9404
9405 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9406 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
9407 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
9408
9409 return 0;
9410}
9411#endif /* CONFIG_CFS_BANDWIDTH */
8995#endif /* CONFIG_FAIR_GROUP_SCHED */ 9412#endif /* CONFIG_FAIR_GROUP_SCHED */
8996 9413
8997#ifdef CONFIG_RT_GROUP_SCHED 9414#ifdef CONFIG_RT_GROUP_SCHED
@@ -9026,6 +9443,22 @@ static struct cftype cpu_files[] = {
9026 .write_u64 = cpu_shares_write_u64, 9443 .write_u64 = cpu_shares_write_u64,
9027 }, 9444 },
9028#endif 9445#endif
9446#ifdef CONFIG_CFS_BANDWIDTH
9447 {
9448 .name = "cfs_quota_us",
9449 .read_s64 = cpu_cfs_quota_read_s64,
9450 .write_s64 = cpu_cfs_quota_write_s64,
9451 },
9452 {
9453 .name = "cfs_period_us",
9454 .read_u64 = cpu_cfs_period_read_u64,
9455 .write_u64 = cpu_cfs_period_write_u64,
9456 },
9457 {
9458 .name = "stat",
9459 .read_map = cpu_stats_show,
9460 },
9461#endif
9029#ifdef CONFIG_RT_GROUP_SCHED 9462#ifdef CONFIG_RT_GROUP_SCHED
9030 { 9463 {
9031 .name = "rt_runtime_us", 9464 .name = "rt_runtime_us",
@@ -9335,4 +9768,3 @@ struct cgroup_subsys cpuacct_subsys = {
9335 .subsys_id = cpuacct_subsys_id, 9768 .subsys_id = cpuacct_subsys_id,
9336}; 9769};
9337#endif /* CONFIG_CGROUP_CPUACCT */ 9770#endif /* CONFIG_CGROUP_CPUACCT */
9338
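
For reference, the cgroup control files added near the end of this diff (cpu.cfs_quota_us, cpu.cfs_period_us and cpu.stat) are programmed entirely from user space. Below is a minimal sketch of how a container manager might set a bandwidth limit through them; it assumes a cgroup-v1 style cpu controller mounted at /sys/fs/cgroup/cpu and an already-created group directory named "demo", neither of which is part of the patch itself.

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Write a decimal value into one of the cpu.cfs_* control files. */
	static int write_cgroup_val(const char *path, long long val)
	{
		char buf[32];
		int fd, len, ret = 0;

		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;

		len = snprintf(buf, sizeof(buf), "%lld", val);
		if (write(fd, buf, len) != len)
			ret = -1;

		close(fd);
		return ret;
	}

	int main(void)
	{
		/* Hypothetical mount point and group; adjust to the local setup. */
		const char *grp = "/sys/fs/cgroup/cpu/demo";
		char path[256];

		/* 100ms enforcement period ... */
		snprintf(path, sizeof(path), "%s/cpu.cfs_period_us", grp);
		write_cgroup_val(path, 100000);

		/* ... with 25ms of runtime per period, i.e. a quarter of one CPU. */
		snprintf(path, sizeof(path), "%s/cpu.cfs_quota_us", grp);
		write_cgroup_val(path, 25000);

		return 0;
	}

Per the tg_set_cfs_quota() hunk above, writing a negative value to cpu.cfs_quota_us maps to RUNTIME_INF and disables throttling for the group, while cpu.stat reports the nr_periods, nr_throttled and throttled_time counters filled in by cpu_stats_show().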