Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 666 |
1 file changed, 549 insertions(+), 117 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 03ad0113801a..d87c6e5d4e8c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void) | |||
196 | return sysctl_sched_rt_runtime >= 0; | 196 | return sysctl_sched_rt_runtime >= 0; |
197 | } | 197 | } |
198 | 198 | ||
199 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | 199 | static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) |
200 | { | 200 | { |
201 | ktime_t now; | 201 | unsigned long delta; |
202 | ktime_t soft, hard, now; | ||
203 | |||
204 | for (;;) { | ||
205 | if (hrtimer_active(period_timer)) | ||
206 | break; | ||
207 | |||
208 | now = hrtimer_cb_get_time(period_timer); | ||
209 | hrtimer_forward(period_timer, now, period); | ||
202 | 210 | ||
211 | soft = hrtimer_get_softexpires(period_timer); | ||
212 | hard = hrtimer_get_expires(period_timer); | ||
213 | delta = ktime_to_ns(ktime_sub(hard, soft)); | ||
214 | __hrtimer_start_range_ns(period_timer, soft, delta, | ||
215 | HRTIMER_MODE_ABS_PINNED, 0); | ||
216 | } | ||
217 | } | ||
218 | |||
219 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
220 | { | ||
203 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | 221 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) |
204 | return; | 222 | return; |
205 | 223 | ||
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
207 | return; | 225 | return; |
208 | 226 | ||
209 | raw_spin_lock(&rt_b->rt_runtime_lock); | 227 | raw_spin_lock(&rt_b->rt_runtime_lock); |
210 | for (;;) { | 228 | start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); |
211 | unsigned long delta; | ||
212 | ktime_t soft, hard; | ||
213 | |||
214 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
215 | break; | ||
216 | |||
217 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); | ||
218 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); | ||
219 | |||
220 | soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); | ||
221 | hard = hrtimer_get_expires(&rt_b->rt_period_timer); | ||
222 | delta = ktime_to_ns(ktime_sub(hard, soft)); | ||
223 | __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, | ||
224 | HRTIMER_MODE_ABS_PINNED, 0); | ||
225 | } | ||
226 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 229 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
227 | } | 230 | } |
228 | 231 | ||
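The refactor above pulls the re-arm loop into start_bandwidth_timer() so the RT code and the new CFS bandwidth code can share it. The pivotal call is hrtimer_forward(), which advances the timer's expiry in whole multiples of the period until it lies past `now` and returns how many periods were skipped. A minimal sketch of that contract (account_missed() is a hypothetical helper, not part of the patch):

	ktime_t now = hrtimer_cb_get_time(timer);
	u64 overrun = hrtimer_forward(timer, now, period);

	if (overrun)
		account_missed(overrun);	/* hypothetical: catch up on skipped periods */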
@@ -247,6 +250,24 @@ struct cfs_rq; | |||
247 | 250 | ||
248 | static LIST_HEAD(task_groups); | 251 | static LIST_HEAD(task_groups); |
249 | 252 | ||
253 | struct cfs_bandwidth { | ||
254 | #ifdef CONFIG_CFS_BANDWIDTH | ||
255 | raw_spinlock_t lock; | ||
256 | ktime_t period; | ||
257 | u64 quota, runtime; | ||
258 | s64 hierarchal_quota; | ||
259 | u64 runtime_expires; | ||
260 | |||
261 | int idle, timer_active; | ||
262 | struct hrtimer period_timer, slack_timer; | ||
263 | struct list_head throttled_cfs_rq; | ||
264 | |||
265 | /* statistics */ | ||
266 | int nr_periods, nr_throttled; | ||
267 | u64 throttled_time; | ||
268 | #endif | ||
269 | }; | ||
270 | |||
250 | /* task group related information */ | 271 | /* task group related information */ |
251 | struct task_group { | 272 | struct task_group { |
252 | struct cgroup_subsys_state css; | 273 | struct cgroup_subsys_state css; |
@@ -278,6 +299,8 @@ struct task_group { | |||
278 | #ifdef CONFIG_SCHED_AUTOGROUP | 299 | #ifdef CONFIG_SCHED_AUTOGROUP |
279 | struct autogroup *autogroup; | 300 | struct autogroup *autogroup; |
280 | #endif | 301 | #endif |
302 | |||
303 | struct cfs_bandwidth cfs_bandwidth; | ||
281 | }; | 304 | }; |
282 | 305 | ||
283 | /* task_group_lock serializes the addition/removal of task groups */ | 306 | /* task_group_lock serializes the addition/removal of task groups */ |
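In the new cfs_bandwidth struct, `quota` is the runtime (in nanoseconds) the group may consume per `period`, and `runtime` is what remains of it in the current period. Quota below the period caps the group at a fraction of one CPU; quota above it lets the group span multiple CPUs. A hedged illustration (cfs_share_x1000() is hypothetical and ignores the RUNTIME_INF case):

	/*
	 * Group's maximum CPU share, in thousandths of a CPU:
	 *   quota =  50ms, period = 100ms ->  500 (half of one CPU)
	 *   quota = 200ms, period = 100ms -> 2000 (two full CPUs)
	 */
	static u64 cfs_share_x1000(struct cfs_bandwidth *cfs_b)
	{
		return div64_u64(cfs_b->quota * 1000,
				 ktime_to_ns(cfs_b->period));
	}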
@@ -311,7 +334,7 @@ struct task_group root_task_group; | |||
311 | /* CFS-related fields in a runqueue */ | 334 | /* CFS-related fields in a runqueue */ |
312 | struct cfs_rq { | 335 | struct cfs_rq { |
313 | struct load_weight load; | 336 | struct load_weight load; |
314 | unsigned long nr_running; | 337 | unsigned long nr_running, h_nr_running; |
315 | 338 | ||
316 | u64 exec_clock; | 339 | u64 exec_clock; |
317 | u64 min_vruntime; | 340 | u64 min_vruntime; |
@@ -377,9 +400,120 @@ struct cfs_rq { | |||
377 | 400 | ||
378 | unsigned long load_contribution; | 401 | unsigned long load_contribution; |
379 | #endif | 402 | #endif |
403 | #ifdef CONFIG_CFS_BANDWIDTH | ||
404 | int runtime_enabled; | ||
405 | u64 runtime_expires; | ||
406 | s64 runtime_remaining; | ||
407 | |||
408 | u64 throttled_timestamp; | ||
409 | int throttled, throttle_count; | ||
410 | struct list_head throttled_list; | ||
411 | #endif | ||
380 | #endif | 412 | #endif |
381 | }; | 413 | }; |
382 | 414 | ||
415 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
416 | #ifdef CONFIG_CFS_BANDWIDTH | ||
417 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
418 | { | ||
419 | return &tg->cfs_bandwidth; | ||
420 | } | ||
421 | |||
422 | static inline u64 default_cfs_period(void); | ||
423 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | ||
424 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | ||
425 | |||
426 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | ||
427 | { | ||
428 | struct cfs_bandwidth *cfs_b = | ||
429 | container_of(timer, struct cfs_bandwidth, slack_timer); | ||
430 | do_sched_cfs_slack_timer(cfs_b); | ||
431 | |||
432 | return HRTIMER_NORESTART; | ||
433 | } | ||
434 | |||
435 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | ||
436 | { | ||
437 | struct cfs_bandwidth *cfs_b = | ||
438 | container_of(timer, struct cfs_bandwidth, period_timer); | ||
439 | ktime_t now; | ||
440 | int overrun; | ||
441 | int idle = 0; | ||
442 | |||
443 | for (;;) { | ||
444 | now = hrtimer_cb_get_time(timer); | ||
445 | overrun = hrtimer_forward(timer, now, cfs_b->period); | ||
446 | |||
447 | if (!overrun) | ||
448 | break; | ||
449 | |||
450 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | ||
451 | } | ||
452 | |||
453 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
454 | } | ||
455 | |||
456 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
457 | { | ||
458 | raw_spin_lock_init(&cfs_b->lock); | ||
459 | cfs_b->runtime = 0; | ||
460 | cfs_b->quota = RUNTIME_INF; | ||
461 | cfs_b->period = ns_to_ktime(default_cfs_period()); | ||
462 | |||
463 | INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); | ||
464 | hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
465 | cfs_b->period_timer.function = sched_cfs_period_timer; | ||
466 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
467 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | ||
468 | } | ||
469 | |||
470 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
471 | { | ||
472 | cfs_rq->runtime_enabled = 0; | ||
473 | INIT_LIST_HEAD(&cfs_rq->throttled_list); | ||
474 | } | ||
475 | |||
476 | /* requires cfs_b->lock, may release to reprogram timer */ | ||
477 | static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
478 | { | ||
479 | /* | ||
480 | * The timer may be active because we're trying to set a new bandwidth | ||
481 | * period or because we're racing with the tear-down path | ||
482 | * (timer_active==0 becomes visible before the hrtimer call-back | ||
483 | * terminates). In either case we ensure that it's re-programmed. | ||
484 | */ | ||
485 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | ||
486 | raw_spin_unlock(&cfs_b->lock); | ||
487 | /* ensure cfs_b->lock is available while we wait */ | ||
488 | hrtimer_cancel(&cfs_b->period_timer); | ||
489 | |||
490 | raw_spin_lock(&cfs_b->lock); | ||
491 | /* if someone else restarted the timer then we're done */ | ||
492 | if (cfs_b->timer_active) | ||
493 | return; | ||
494 | } | ||
495 | |||
496 | cfs_b->timer_active = 1; | ||
497 | start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); | ||
498 | } | ||
499 | |||
500 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
501 | { | ||
502 | hrtimer_cancel(&cfs_b->period_timer); | ||
503 | hrtimer_cancel(&cfs_b->slack_timer); | ||
504 | } | ||
505 | #else | ||
506 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
507 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
508 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
509 | |||
510 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
511 | { | ||
512 | return NULL; | ||
513 | } | ||
514 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
515 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
516 | |||
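__start_cfs_bandwidth() has to tolerate the period timer callback still running on another CPU, hence the unlock/hrtimer_cancel/relock loop and the timer_active flag. The call sites live in sched_fair.c and are not part of this hunk; under the assumption that they hold cfs_b->lock, the pattern is roughly (ensure_period_timer() is a hypothetical wrapper):

	static void ensure_period_timer(struct cfs_bandwidth *cfs_b)
	{
		raw_spin_lock(&cfs_b->lock);
		if (!cfs_b->timer_active)
			__start_cfs_bandwidth(cfs_b);	/* may drop and retake the lock */
		raw_spin_unlock(&cfs_b->lock);
	}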
383 | /* Real-Time classes' related field in a runqueue: */ | 517 | /* Real-Time classes' related field in a runqueue: */ |
384 | struct rt_rq { | 518 | struct rt_rq { |
385 | struct rt_prio_array active; | 519 | struct rt_prio_array active; |
@@ -510,7 +644,7 @@ struct rq { | |||
510 | 644 | ||
511 | unsigned long cpu_power; | 645 | unsigned long cpu_power; |
512 | 646 | ||
513 | unsigned char idle_at_tick; | 647 | unsigned char idle_balance; |
514 | /* For active balancing */ | 648 | /* For active balancing */ |
515 | int post_schedule; | 649 | int post_schedule; |
516 | int active_balance; | 650 | int active_balance; |
@@ -520,8 +654,6 @@ struct rq { | |||
520 | int cpu; | 654 | int cpu; |
521 | int online; | 655 | int online; |
522 | 656 | ||
523 | unsigned long avg_load_per_task; | ||
524 | |||
525 | u64 rt_avg; | 657 | u64 rt_avg; |
526 | u64 age_stamp; | 658 | u64 age_stamp; |
527 | u64 idle_stamp; | 659 | u64 idle_stamp; |
@@ -570,7 +702,7 @@ struct rq { | |||
570 | #endif | 702 | #endif |
571 | 703 | ||
572 | #ifdef CONFIG_SMP | 704 | #ifdef CONFIG_SMP |
573 | struct task_struct *wake_list; | 705 | struct llist_head wake_list; |
574 | #endif | 706 | #endif |
575 | }; | 707 | }; |
576 | 708 | ||
@@ -1272,6 +1404,18 @@ void wake_up_idle_cpu(int cpu) | |||
1272 | smp_send_reschedule(cpu); | 1404 | smp_send_reschedule(cpu); |
1273 | } | 1405 | } |
1274 | 1406 | ||
1407 | static inline bool got_nohz_idle_kick(void) | ||
1408 | { | ||
1409 | return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; | ||
1410 | } | ||
1411 | |||
1412 | #else /* CONFIG_NO_HZ */ | ||
1413 | |||
1414 | static inline bool got_nohz_idle_kick(void) | ||
1415 | { | ||
1416 | return false; | ||
1417 | } | ||
1418 | |||
1275 | #endif /* CONFIG_NO_HZ */ | 1419 | #endif /* CONFIG_NO_HZ */ |
1276 | 1420 | ||
1277 | static u64 sched_avg_period(void) | 1421 | static u64 sched_avg_period(void) |
@@ -1471,24 +1615,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
1471 | update_load_sub(&rq->load, load); | 1615 | update_load_sub(&rq->load, load); |
1472 | } | 1616 | } |
1473 | 1617 | ||
1474 | #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) | 1618 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ |
1619 | (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) | ||
1475 | typedef int (*tg_visitor)(struct task_group *, void *); | 1620 | typedef int (*tg_visitor)(struct task_group *, void *); |
1476 | 1621 | ||
1477 | /* | 1622 | /* |
1478 | * Iterate the full tree, calling @down when first entering a node and @up when | 1623 | * Iterate task_group tree rooted at *from, calling @down when first entering a |
1479 | * leaving it for the final time. | 1624 | * node and @up when leaving it for the final time. |
1625 | * | ||
1626 | * Caller must hold rcu_lock or sufficient equivalent. | ||
1480 | */ | 1627 | */ |
1481 | static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | 1628 | static int walk_tg_tree_from(struct task_group *from, |
1629 | tg_visitor down, tg_visitor up, void *data) | ||
1482 | { | 1630 | { |
1483 | struct task_group *parent, *child; | 1631 | struct task_group *parent, *child; |
1484 | int ret; | 1632 | int ret; |
1485 | 1633 | ||
1486 | rcu_read_lock(); | 1634 | parent = from; |
1487 | parent = &root_task_group; | 1635 | |
1488 | down: | 1636 | down: |
1489 | ret = (*down)(parent, data); | 1637 | ret = (*down)(parent, data); |
1490 | if (ret) | 1638 | if (ret) |
1491 | goto out_unlock; | 1639 | goto out; |
1492 | list_for_each_entry_rcu(child, &parent->children, siblings) { | 1640 | list_for_each_entry_rcu(child, &parent->children, siblings) { |
1493 | parent = child; | 1641 | parent = child; |
1494 | goto down; | 1642 | goto down; |
@@ -1497,19 +1645,29 @@ up: | |||
1497 | continue; | 1645 | continue; |
1498 | } | 1646 | } |
1499 | ret = (*up)(parent, data); | 1647 | ret = (*up)(parent, data); |
1500 | if (ret) | 1648 | if (ret || parent == from) |
1501 | goto out_unlock; | 1649 | goto out; |
1502 | 1650 | ||
1503 | child = parent; | 1651 | child = parent; |
1504 | parent = parent->parent; | 1652 | parent = parent->parent; |
1505 | if (parent) | 1653 | if (parent) |
1506 | goto up; | 1654 | goto up; |
1507 | out_unlock: | 1655 | out: |
1508 | rcu_read_unlock(); | ||
1509 | |||
1510 | return ret; | 1656 | return ret; |
1511 | } | 1657 | } |
1512 | 1658 | ||
1659 | /* | ||
1660 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
1661 | * leaving it for the final time. | ||
1662 | * | ||
1663 | * Caller must hold rcu_lock or sufficient equivalent. | ||
1664 | */ | ||
1665 | |||
1666 | static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | ||
1667 | { | ||
1668 | return walk_tg_tree_from(&root_task_group, down, up, data); | ||
1669 | } | ||
1670 | |||
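walk_tg_tree_from() is a depth-first traversal spelled out with gotos so that kernel stack usage stays flat regardless of hierarchy depth. A recursive equivalent, as a readability sketch only (the kernel deliberately avoids this form because cgroup nesting depth is not bounded):

	static int walk_tg_tree_recursive(struct task_group *tg,
					  tg_visitor down, tg_visitor up,
					  void *data)
	{
		struct task_group *child;
		int ret;

		ret = down(tg, data);			/* pre-order visit */
		if (ret)
			return ret;
		list_for_each_entry_rcu(child, &tg->children, siblings) {
			ret = walk_tg_tree_recursive(child, down, up, data);
			if (ret)
				return ret;
		}
		return up(tg, data);			/* post-order visit */
	}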
1513 | static int tg_nop(struct task_group *tg, void *data) | 1671 | static int tg_nop(struct task_group *tg, void *data) |
1514 | { | 1672 | { |
1515 | return 0; | 1673 | return 0; |
@@ -1569,11 +1727,9 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1569 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | 1727 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); |
1570 | 1728 | ||
1571 | if (nr_running) | 1729 | if (nr_running) |
1572 | rq->avg_load_per_task = rq->load.weight / nr_running; | 1730 | return rq->load.weight / nr_running; |
1573 | else | ||
1574 | rq->avg_load_per_task = 0; | ||
1575 | 1731 | ||
1576 | return rq->avg_load_per_task; | 1732 | return 0; |
1577 | } | 1733 | } |
1578 | 1734 | ||
1579 | #ifdef CONFIG_PREEMPT | 1735 | #ifdef CONFIG_PREEMPT |
@@ -1806,7 +1962,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1806 | rq->nr_uninterruptible--; | 1962 | rq->nr_uninterruptible--; |
1807 | 1963 | ||
1808 | enqueue_task(rq, p, flags); | 1964 | enqueue_task(rq, p, flags); |
1809 | inc_nr_running(rq); | ||
1810 | } | 1965 | } |
1811 | 1966 | ||
1812 | /* | 1967 | /* |
@@ -1818,7 +1973,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1818 | rq->nr_uninterruptible++; | 1973 | rq->nr_uninterruptible++; |
1819 | 1974 | ||
1820 | dequeue_task(rq, p, flags); | 1975 | dequeue_task(rq, p, flags); |
1821 | dec_nr_running(rq); | ||
1822 | } | 1976 | } |
1823 | 1977 | ||
1824 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 1978 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
@@ -2390,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2390 | 2544 | ||
2391 | /* Look for allowed, online CPU in same node. */ | 2545 | /* Look for allowed, online CPU in same node. */ |
2392 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) | 2546 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) |
2393 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 2547 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
2394 | return dest_cpu; | 2548 | return dest_cpu; |
2395 | 2549 | ||
2396 | /* Any allowed, online CPU? */ | 2550 | /* Any allowed, online CPU? */ |
2397 | dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); | 2551 | dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); |
2398 | if (dest_cpu < nr_cpu_ids) | 2552 | if (dest_cpu < nr_cpu_ids) |
2399 | return dest_cpu; | 2553 | return dest_cpu; |
2400 | 2554 | ||
@@ -2431,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) | |||
2431 | * [ this allows ->select_task() to simply return task_cpu(p) and | 2585 | * [ this allows ->select_task() to simply return task_cpu(p) and |
2432 | * not worry about this generic constraint ] | 2586 | * not worry about this generic constraint ] |
2433 | */ | 2587 | */ |
2434 | if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || | 2588 | if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || |
2435 | !cpu_online(cpu))) | 2589 | !cpu_online(cpu))) |
2436 | cpu = select_fallback_rq(task_cpu(p), p); | 2590 | cpu = select_fallback_rq(task_cpu(p), p); |
2437 | 2591 | ||
@@ -2556,42 +2710,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
2556 | } | 2710 | } |
2557 | 2711 | ||
2558 | #ifdef CONFIG_SMP | 2712 | #ifdef CONFIG_SMP |
2559 | static void sched_ttwu_do_pending(struct task_struct *list) | 2713 | static void sched_ttwu_pending(void) |
2560 | { | 2714 | { |
2561 | struct rq *rq = this_rq(); | 2715 | struct rq *rq = this_rq(); |
2716 | struct llist_node *llist = llist_del_all(&rq->wake_list); | ||
2717 | struct task_struct *p; | ||
2562 | 2718 | ||
2563 | raw_spin_lock(&rq->lock); | 2719 | raw_spin_lock(&rq->lock); |
2564 | 2720 | ||
2565 | while (list) { | 2721 | while (llist) { |
2566 | struct task_struct *p = list; | 2722 | p = llist_entry(llist, struct task_struct, wake_entry); |
2567 | list = list->wake_entry; | 2723 | llist = llist_next(llist); |
2568 | ttwu_do_activate(rq, p, 0); | 2724 | ttwu_do_activate(rq, p, 0); |
2569 | } | 2725 | } |
2570 | 2726 | ||
2571 | raw_spin_unlock(&rq->lock); | 2727 | raw_spin_unlock(&rq->lock); |
2572 | } | 2728 | } |
2573 | 2729 | ||
2574 | #ifdef CONFIG_HOTPLUG_CPU | ||
2575 | |||
2576 | static void sched_ttwu_pending(void) | ||
2577 | { | ||
2578 | struct rq *rq = this_rq(); | ||
2579 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2580 | |||
2581 | if (!list) | ||
2582 | return; | ||
2583 | |||
2584 | sched_ttwu_do_pending(list); | ||
2585 | } | ||
2586 | |||
2587 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
2588 | |||
2589 | void scheduler_ipi(void) | 2730 | void scheduler_ipi(void) |
2590 | { | 2731 | { |
2591 | struct rq *rq = this_rq(); | 2732 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) |
2592 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2593 | |||
2594 | if (!list) | ||
2595 | return; | 2733 | return; |
2596 | 2734 | ||
2597 | /* | 2735 | /* |
@@ -2608,25 +2746,21 @@ void scheduler_ipi(void) | |||
2608 | * somewhat pessimize the simple resched case. | 2746 | * somewhat pessimize the simple resched case. |
2609 | */ | 2747 | */ |
2610 | irq_enter(); | 2748 | irq_enter(); |
2611 | sched_ttwu_do_pending(list); | 2749 | sched_ttwu_pending(); |
2750 | |||
2751 | /* | ||
2752 | * Check if someone kicked us for doing the nohz idle load balance. | ||
2753 | */ | ||
2754 | if (unlikely(got_nohz_idle_kick() && !need_resched())) { | ||
2755 | this_rq()->idle_balance = 1; | ||
2756 | raise_softirq_irqoff(SCHED_SOFTIRQ); | ||
2757 | } | ||
2612 | irq_exit(); | 2758 | irq_exit(); |
2613 | } | 2759 | } |
2614 | 2760 | ||
2615 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | 2761 | static void ttwu_queue_remote(struct task_struct *p, int cpu) |
2616 | { | 2762 | { |
2617 | struct rq *rq = cpu_rq(cpu); | 2763 | if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) |
2618 | struct task_struct *next = rq->wake_list; | ||
2619 | |||
2620 | for (;;) { | ||
2621 | struct task_struct *old = next; | ||
2622 | |||
2623 | p->wake_entry = next; | ||
2624 | next = cmpxchg(&rq->wake_list, old, p); | ||
2625 | if (next == old) | ||
2626 | break; | ||
2627 | } | ||
2628 | |||
2629 | if (!next) | ||
2630 | smp_send_reschedule(cpu); | 2764 | smp_send_reschedule(cpu); |
2631 | } | 2765 | } |
2632 | 2766 | ||
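The rework above replaces the hand-rolled cmpxchg stack with <linux/llist.h>. The property being exploited: llist_add() returns true only when it adds to an empty list, so exactly one producer sends the reschedule IPI, and the consumer detaches the whole batch with a single llist_del_all(). A condensed sketch of the pattern (process_wakeup() and cpu are stand-ins):

	/* producer, any CPU: queue the task; kick only on first insertion */
	if (llist_add(&p->wake_entry, &rq->wake_list))
		smp_send_reschedule(cpu);

	/* consumer, target CPU: detach everything, then walk lock-free */
	struct llist_node *node = llist_del_all(&rq->wake_list);

	while (node) {
		struct task_struct *t = llist_entry(node, struct task_struct,
						    wake_entry);
		node = llist_next(node);
		process_wakeup(t);
	}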
@@ -2848,19 +2982,23 @@ void sched_fork(struct task_struct *p) | |||
2848 | p->state = TASK_RUNNING; | 2982 | p->state = TASK_RUNNING; |
2849 | 2983 | ||
2850 | /* | 2984 | /* |
2985 | * Make sure we do not leak PI boosting priority to the child. | ||
2986 | */ | ||
2987 | p->prio = current->normal_prio; | ||
2988 | |||
2989 | /* | ||
2851 | * Revert to default priority/policy on fork if requested. | 2990 | * Revert to default priority/policy on fork if requested. |
2852 | */ | 2991 | */ |
2853 | if (unlikely(p->sched_reset_on_fork)) { | 2992 | if (unlikely(p->sched_reset_on_fork)) { |
2854 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { | 2993 | if (task_has_rt_policy(p)) { |
2855 | p->policy = SCHED_NORMAL; | 2994 | p->policy = SCHED_NORMAL; |
2856 | p->normal_prio = p->static_prio; | ||
2857 | } | ||
2858 | |||
2859 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
2860 | p->static_prio = NICE_TO_PRIO(0); | 2995 | p->static_prio = NICE_TO_PRIO(0); |
2861 | p->normal_prio = p->static_prio; | 2996 | p->rt_priority = 0; |
2862 | set_load_weight(p); | 2997 | } else if (PRIO_TO_NICE(p->static_prio) < 0) |
2863 | } | 2998 | p->static_prio = NICE_TO_PRIO(0); |
2999 | |||
3000 | p->prio = p->normal_prio = __normal_prio(p); | ||
3001 | set_load_weight(p); | ||
2864 | 3002 | ||
2865 | /* | 3003 | /* |
2866 | * We don't need the reset flag anymore after the fork. It has | 3004 | * We don't need the reset flag anymore after the fork. It has |
@@ -2869,11 +3007,6 @@ void sched_fork(struct task_struct *p) | |||
2869 | p->sched_reset_on_fork = 0; | 3007 | p->sched_reset_on_fork = 0; |
2870 | } | 3008 | } |
2871 | 3009 | ||
2872 | /* | ||
2873 | * Make sure we do not leak PI boosting priority to the child. | ||
2874 | */ | ||
2875 | p->prio = current->normal_prio; | ||
2876 | |||
2877 | if (!rt_prio(p->prio)) | 3010 | if (!rt_prio(p->prio)) |
2878 | p->sched_class = &fair_sched_class; | 3011 | p->sched_class = &fair_sched_class; |
2879 | 3012 | ||
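For reference when reading the reset path above: with the standard priority macros of this era, NICE_TO_PRIO(0) == MAX_RT_PRIO + 0 + 20 == 120, so a task carrying sched_reset_on_fork ends up as a plain nice-0 SCHED_NORMAL task with static_prio == prio == normal_prio == 120:

	/*
	 * NICE_TO_PRIO(nice) == MAX_RT_PRIO + (nice) + 20   (MAX_RT_PRIO == 100)
	 * PRIO_TO_NICE(prio) == (prio) - MAX_RT_PRIO - 20
	 * nice range [-20, 19] maps to prio range [100, 139]; nice 0 -> 120.
	 */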
@@ -4116,7 +4249,7 @@ void scheduler_tick(void) | |||
4116 | perf_event_task_tick(); | 4249 | perf_event_task_tick(); |
4117 | 4250 | ||
4118 | #ifdef CONFIG_SMP | 4251 | #ifdef CONFIG_SMP |
4119 | rq->idle_at_tick = idle_cpu(cpu); | 4252 | rq->idle_balance = idle_cpu(cpu); |
4120 | trigger_load_balance(rq, cpu); | 4253 | trigger_load_balance(rq, cpu); |
4121 | #endif | 4254 | #endif |
4122 | } | 4255 | } |
@@ -4240,7 +4373,7 @@ pick_next_task(struct rq *rq) | |||
4240 | * Optimization: we know that if all tasks are in | 4373 | * Optimization: we know that if all tasks are in |
4241 | * the fair class we can call that function directly: | 4374 | * the fair class we can call that function directly: |
4242 | */ | 4375 | */ |
4243 | if (likely(rq->nr_running == rq->cfs.nr_running)) { | 4376 | if (likely(rq->nr_running == rq->cfs.h_nr_running)) { |
4244 | p = fair_sched_class.pick_next_task(rq); | 4377 | p = fair_sched_class.pick_next_task(rq); |
4245 | if (likely(p)) | 4378 | if (likely(p)) |
4246 | return p; | 4379 | return p; |
@@ -5026,7 +5159,20 @@ EXPORT_SYMBOL(task_nice); | |||
5026 | */ | 5159 | */ |
5027 | int idle_cpu(int cpu) | 5160 | int idle_cpu(int cpu) |
5028 | { | 5161 | { |
5029 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; | 5162 | struct rq *rq = cpu_rq(cpu); |
5163 | |||
5164 | if (rq->curr != rq->idle) | ||
5165 | return 0; | ||
5166 | |||
5167 | if (rq->nr_running) | ||
5168 | return 0; | ||
5169 | |||
5170 | #ifdef CONFIG_SMP | ||
5171 | if (!llist_empty(&rq->wake_list)) | ||
5172 | return 0; | ||
5173 | #endif | ||
5174 | |||
5175 | return 1; | ||
5030 | } | 5176 | } |
5031 | 5177 | ||
5032 | /** | 5178 | /** |
@@ -5876,7 +6022,7 @@ void show_state_filter(unsigned long state_filter) | |||
5876 | printk(KERN_INFO | 6022 | printk(KERN_INFO |
5877 | " task PC stack pid father\n"); | 6023 | " task PC stack pid father\n"); |
5878 | #endif | 6024 | #endif |
5879 | read_lock(&tasklist_lock); | 6025 | rcu_read_lock(); |
5880 | do_each_thread(g, p) { | 6026 | do_each_thread(g, p) { |
5881 | /* | 6027 | /* |
5882 | * reset the NMI-timeout, listing all files on a slow | 6028 | * reset the NMI-timeout, listing all files on a slow |
@@ -5892,7 +6038,7 @@ void show_state_filter(unsigned long state_filter) | |||
5892 | #ifdef CONFIG_SCHED_DEBUG | 6038 | #ifdef CONFIG_SCHED_DEBUG |
5893 | sysrq_sched_debug_show(); | 6039 | sysrq_sched_debug_show(); |
5894 | #endif | 6040 | #endif |
5895 | read_unlock(&tasklist_lock); | 6041 | rcu_read_unlock(); |
5896 | /* | 6042 | /* |
5897 | * Only show locks if all tasks are dumped: | 6043 | * Only show locks if all tasks are dumped: |
5898 | */ | 6044 | */ |
@@ -6007,10 +6153,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
6007 | { | 6153 | { |
6008 | if (p->sched_class && p->sched_class->set_cpus_allowed) | 6154 | if (p->sched_class && p->sched_class->set_cpus_allowed) |
6009 | p->sched_class->set_cpus_allowed(p, new_mask); | 6155 | p->sched_class->set_cpus_allowed(p, new_mask); |
6010 | else { | 6156 | |
6011 | cpumask_copy(&p->cpus_allowed, new_mask); | 6157 | cpumask_copy(&p->cpus_allowed, new_mask); |
6012 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | 6158 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); |
6013 | } | ||
6014 | } | 6159 | } |
6015 | 6160 | ||
6016 | /* | 6161 | /* |
@@ -6108,7 +6253,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
6108 | if (task_cpu(p) != src_cpu) | 6253 | if (task_cpu(p) != src_cpu) |
6109 | goto done; | 6254 | goto done; |
6110 | /* Affinity changed (again). */ | 6255 | /* Affinity changed (again). */ |
6111 | if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 6256 | if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
6112 | goto fail; | 6257 | goto fail; |
6113 | 6258 | ||
6114 | /* | 6259 | /* |
@@ -6189,6 +6334,30 @@ static void calc_global_load_remove(struct rq *rq) | |||
6189 | rq->calc_load_active = 0; | 6334 | rq->calc_load_active = 0; |
6190 | } | 6335 | } |
6191 | 6336 | ||
6337 | #ifdef CONFIG_CFS_BANDWIDTH | ||
6338 | static void unthrottle_offline_cfs_rqs(struct rq *rq) | ||
6339 | { | ||
6340 | struct cfs_rq *cfs_rq; | ||
6341 | |||
6342 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
6343 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
6344 | |||
6345 | if (!cfs_rq->runtime_enabled) | ||
6346 | continue; | ||
6347 | |||
6348 | /* | ||
6349 | * clock_task is not advancing so we just need to make sure | ||
6350 | * there's some valid quota amount | ||
6351 | */ | ||
6352 | cfs_rq->runtime_remaining = cfs_b->quota; | ||
6353 | if (cfs_rq_throttled(cfs_rq)) | ||
6354 | unthrottle_cfs_rq(cfs_rq); | ||
6355 | } | ||
6356 | } | ||
6357 | #else | ||
6358 | static void unthrottle_offline_cfs_rqs(struct rq *rq) {} | ||
6359 | #endif | ||
6360 | |||
6192 | /* | 6361 | /* |
6193 | * Migrate all tasks from the rq, sleeping tasks will be migrated by | 6362 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
6194 | * try_to_wake_up()->select_task_rq(). | 6363 | * try_to_wake_up()->select_task_rq(). |
@@ -6214,6 +6383,9 @@ static void migrate_tasks(unsigned int dead_cpu) | |||
6214 | */ | 6383 | */ |
6215 | rq->stop = NULL; | 6384 | rq->stop = NULL; |
6216 | 6385 | ||
6386 | /* Ensure any throttled groups are reachable by pick_next_task */ | ||
6387 | unthrottle_offline_cfs_rqs(rq); | ||
6388 | |||
6217 | for ( ; ; ) { | 6389 | for ( ; ; ) { |
6218 | /* | 6390 | /* |
6219 | * There's this thread running, bail when that's the only | 6391 | * There's this thread running, bail when that's the only |
@@ -7957,6 +8129,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
7957 | /* allow initial update_cfs_load() to truncate */ | 8129 | /* allow initial update_cfs_load() to truncate */ |
7958 | cfs_rq->load_stamp = 1; | 8130 | cfs_rq->load_stamp = 1; |
7959 | #endif | 8131 | #endif |
8132 | init_cfs_rq_runtime(cfs_rq); | ||
7960 | 8133 | ||
7961 | tg->cfs_rq[cpu] = cfs_rq; | 8134 | tg->cfs_rq[cpu] = cfs_rq; |
7962 | tg->se[cpu] = se; | 8135 | tg->se[cpu] = se; |
@@ -8096,6 +8269,7 @@ void __init sched_init(void) | |||
8096 | * We achieve this by letting root_task_group's tasks sit | 8269 | * We achieve this by letting root_task_group's tasks sit |
8097 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). | 8270 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). |
8098 | */ | 8271 | */ |
8272 | init_cfs_bandwidth(&root_task_group.cfs_bandwidth); | ||
8099 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); | 8273 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
8100 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8274 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8101 | 8275 | ||
@@ -8125,7 +8299,6 @@ void __init sched_init(void) | |||
8125 | rq_attach_root(rq, &def_root_domain); | 8299 | rq_attach_root(rq, &def_root_domain); |
8126 | #ifdef CONFIG_NO_HZ | 8300 | #ifdef CONFIG_NO_HZ |
8127 | rq->nohz_balance_kick = 0; | 8301 | rq->nohz_balance_kick = 0; |
8128 | init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); | ||
8129 | #endif | 8302 | #endif |
8130 | #endif | 8303 | #endif |
8131 | init_rq_hrtick(rq); | 8304 | init_rq_hrtick(rq); |
@@ -8336,6 +8509,8 @@ static void free_fair_sched_group(struct task_group *tg) | |||
8336 | { | 8509 | { |
8337 | int i; | 8510 | int i; |
8338 | 8511 | ||
8512 | destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
8513 | |||
8339 | for_each_possible_cpu(i) { | 8514 | for_each_possible_cpu(i) { |
8340 | if (tg->cfs_rq) | 8515 | if (tg->cfs_rq) |
8341 | kfree(tg->cfs_rq[i]); | 8516 | kfree(tg->cfs_rq[i]); |
@@ -8363,6 +8538,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8363 | 8538 | ||
8364 | tg->shares = NICE_0_LOAD; | 8539 | tg->shares = NICE_0_LOAD; |
8365 | 8540 | ||
8541 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
8542 | |||
8366 | for_each_possible_cpu(i) { | 8543 | for_each_possible_cpu(i) { |
8367 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8544 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8368 | GFP_KERNEL, cpu_to_node(i)); | 8545 | GFP_KERNEL, cpu_to_node(i)); |
@@ -8638,12 +8815,7 @@ unsigned long sched_group_shares(struct task_group *tg) | |||
8638 | } | 8815 | } |
8639 | #endif | 8816 | #endif |
8640 | 8817 | ||
8641 | #ifdef CONFIG_RT_GROUP_SCHED | 8818 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) |
8642 | /* | ||
8643 | * Ensure that the real time constraints are schedulable. | ||
8644 | */ | ||
8645 | static DEFINE_MUTEX(rt_constraints_mutex); | ||
8646 | |||
8647 | static unsigned long to_ratio(u64 period, u64 runtime) | 8819 | static unsigned long to_ratio(u64 period, u64 runtime) |
8648 | { | 8820 | { |
8649 | if (runtime == RUNTIME_INF) | 8821 | if (runtime == RUNTIME_INF) |
@@ -8651,6 +8823,13 @@ static unsigned long to_ratio(u64 period, u64 runtime) | |||
8651 | 8823 | ||
8652 | return div64_u64(runtime << 20, period); | 8824 | return div64_u64(runtime << 20, period); |
8653 | } | 8825 | } |
8826 | #endif | ||
8827 | |||
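to_ratio() encodes runtime/period as a fixed-point fraction scaled by 2^20, which lets the schedulability checks sum and compare bandwidth shares in integer arithmetic. A worked example:

	/*
	 * runtime = 50000us, period = 100000us:
	 * to_ratio(100000, 50000) = (50000 << 20) / 100000
	 *                         = 52428800000 / 100000
	 *                         = 524288 = 1 << 19, i.e. 0.5 in Q20.
	 */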
8828 | #ifdef CONFIG_RT_GROUP_SCHED | ||
8829 | /* | ||
8830 | * Ensure that the real time constraints are schedulable. | ||
8831 | */ | ||
8832 | static DEFINE_MUTEX(rt_constraints_mutex); | ||
8654 | 8833 | ||
8655 | /* Must be called with tasklist_lock held */ | 8834 | /* Must be called with tasklist_lock held */ |
8656 | static inline int tg_has_rt_tasks(struct task_group *tg) | 8835 | static inline int tg_has_rt_tasks(struct task_group *tg) |
@@ -8671,7 +8850,7 @@ struct rt_schedulable_data { | |||
8671 | u64 rt_runtime; | 8850 | u64 rt_runtime; |
8672 | }; | 8851 | }; |
8673 | 8852 | ||
8674 | static int tg_schedulable(struct task_group *tg, void *data) | 8853 | static int tg_rt_schedulable(struct task_group *tg, void *data) |
8675 | { | 8854 | { |
8676 | struct rt_schedulable_data *d = data; | 8855 | struct rt_schedulable_data *d = data; |
8677 | struct task_group *child; | 8856 | struct task_group *child; |
@@ -8729,16 +8908,22 @@ static int tg_schedulable(struct task_group *tg, void *data) | |||
8729 | 8908 | ||
8730 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8909 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
8731 | { | 8910 | { |
8911 | int ret; | ||
8912 | |||
8732 | struct rt_schedulable_data data = { | 8913 | struct rt_schedulable_data data = { |
8733 | .tg = tg, | 8914 | .tg = tg, |
8734 | .rt_period = period, | 8915 | .rt_period = period, |
8735 | .rt_runtime = runtime, | 8916 | .rt_runtime = runtime, |
8736 | }; | 8917 | }; |
8737 | 8918 | ||
8738 | return walk_tg_tree(tg_schedulable, tg_nop, &data); | 8919 | rcu_read_lock(); |
8920 | ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); | ||
8921 | rcu_read_unlock(); | ||
8922 | |||
8923 | return ret; | ||
8739 | } | 8924 | } |
8740 | 8925 | ||
8741 | static int tg_set_bandwidth(struct task_group *tg, | 8926 | static int tg_set_rt_bandwidth(struct task_group *tg, |
8742 | u64 rt_period, u64 rt_runtime) | 8927 | u64 rt_period, u64 rt_runtime) |
8743 | { | 8928 | { |
8744 | int i, err = 0; | 8929 | int i, err = 0; |
@@ -8777,7 +8962,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
8777 | if (rt_runtime_us < 0) | 8962 | if (rt_runtime_us < 0) |
8778 | rt_runtime = RUNTIME_INF; | 8963 | rt_runtime = RUNTIME_INF; |
8779 | 8964 | ||
8780 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8965 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
8781 | } | 8966 | } |
8782 | 8967 | ||
8783 | long sched_group_rt_runtime(struct task_group *tg) | 8968 | long sched_group_rt_runtime(struct task_group *tg) |
@@ -8802,7 +8987,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | |||
8802 | if (rt_period == 0) | 8987 | if (rt_period == 0) |
8803 | return -EINVAL; | 8988 | return -EINVAL; |
8804 | 8989 | ||
8805 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8990 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
8806 | } | 8991 | } |
8807 | 8992 | ||
8808 | long sched_group_rt_period(struct task_group *tg) | 8993 | long sched_group_rt_period(struct task_group *tg) |
@@ -8992,6 +9177,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
8992 | 9177 | ||
8993 | return (u64) scale_load_down(tg->shares); | 9178 | return (u64) scale_load_down(tg->shares); |
8994 | } | 9179 | } |
9180 | |||
9181 | #ifdef CONFIG_CFS_BANDWIDTH | ||
9182 | static DEFINE_MUTEX(cfs_constraints_mutex); | ||
9183 | |||
9184 | const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ | ||
9185 | const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ | ||
9186 | |||
9187 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); | ||
9188 | |||
9189 | static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | ||
9190 | { | ||
9191 | int i, ret = 0, runtime_enabled; | ||
9192 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
9193 | |||
9194 | if (tg == &root_task_group) | ||
9195 | return -EINVAL; | ||
9196 | |||
9197 | /* | ||
9198 | * Ensure we have at least some amount of bandwidth every period. This | ||
9199 | * is to prevent reaching a state of large arrears when throttled via | ||
9200 | * entity_tick(), resulting in prolonged exit starvation. | ||
9201 | */ | ||
9202 | if (quota < min_cfs_quota_period || period < min_cfs_quota_period) | ||
9203 | return -EINVAL; | ||
9204 | |||
9205 | /* | ||
9206 | * Likewise, bound things on the other side by preventing insane quota | ||
9207 | * periods. This also allows us to normalize in computing quota | ||
9208 | * feasibility. | ||
9209 | */ | ||
9210 | if (period > max_cfs_quota_period) | ||
9211 | return -EINVAL; | ||
9212 | |||
9213 | mutex_lock(&cfs_constraints_mutex); | ||
9214 | ret = __cfs_schedulable(tg, period, quota); | ||
9215 | if (ret) | ||
9216 | goto out_unlock; | ||
9217 | |||
9218 | runtime_enabled = quota != RUNTIME_INF; | ||
9219 | raw_spin_lock_irq(&cfs_b->lock); | ||
9220 | cfs_b->period = ns_to_ktime(period); | ||
9221 | cfs_b->quota = quota; | ||
9222 | |||
9223 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
9224 | /* restart the period timer (if active) to handle new period expiry */ | ||
9225 | if (runtime_enabled && cfs_b->timer_active) { | ||
9226 | /* force a reprogram */ | ||
9227 | cfs_b->timer_active = 0; | ||
9228 | __start_cfs_bandwidth(cfs_b); | ||
9229 | } | ||
9230 | raw_spin_unlock_irq(&cfs_b->lock); | ||
9231 | |||
9232 | for_each_possible_cpu(i) { | ||
9233 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; | ||
9234 | struct rq *rq = rq_of(cfs_rq); | ||
9235 | |||
9236 | raw_spin_lock_irq(&rq->lock); | ||
9237 | cfs_rq->runtime_enabled = runtime_enabled; | ||
9238 | cfs_rq->runtime_remaining = 0; | ||
9239 | |||
9240 | if (cfs_rq_throttled(cfs_rq)) | ||
9241 | unthrottle_cfs_rq(cfs_rq); | ||
9242 | raw_spin_unlock_irq(&rq->lock); | ||
9243 | } | ||
9244 | out_unlock: | ||
9245 | mutex_unlock(&cfs_constraints_mutex); | ||
9246 | |||
9247 | return ret; | ||
9248 | } | ||
9249 | |||
9250 | int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) | ||
9251 | { | ||
9252 | u64 quota, period; | ||
9253 | |||
9254 | period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | ||
9255 | if (cfs_quota_us < 0) | ||
9256 | quota = RUNTIME_INF; | ||
9257 | else | ||
9258 | quota = (u64)cfs_quota_us * NSEC_PER_USEC; | ||
9259 | |||
9260 | return tg_set_cfs_bandwidth(tg, period, quota); | ||
9261 | } | ||
9262 | |||
9263 | long tg_get_cfs_quota(struct task_group *tg) | ||
9264 | { | ||
9265 | u64 quota_us; | ||
9266 | |||
9267 | if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) | ||
9268 | return -1; | ||
9269 | |||
9270 | quota_us = tg_cfs_bandwidth(tg)->quota; | ||
9271 | do_div(quota_us, NSEC_PER_USEC); | ||
9272 | |||
9273 | return quota_us; | ||
9274 | } | ||
9275 | |||
9276 | int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) | ||
9277 | { | ||
9278 | u64 quota, period; | ||
9279 | |||
9280 | period = (u64)cfs_period_us * NSEC_PER_USEC; | ||
9281 | quota = tg_cfs_bandwidth(tg)->quota; | ||
9282 | |||
9283 | if (period <= 0) | ||
9284 | return -EINVAL; | ||
9285 | |||
9286 | return tg_set_cfs_bandwidth(tg, period, quota); | ||
9287 | } | ||
9288 | |||
9289 | long tg_get_cfs_period(struct task_group *tg) | ||
9290 | { | ||
9291 | u64 cfs_period_us; | ||
9292 | |||
9293 | cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | ||
9294 | do_div(cfs_period_us, NSEC_PER_USEC); | ||
9295 | |||
9296 | return cfs_period_us; | ||
9297 | } | ||
9298 | |||
9299 | static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) | ||
9300 | { | ||
9301 | return tg_get_cfs_quota(cgroup_tg(cgrp)); | ||
9302 | } | ||
9303 | |||
9304 | static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, | ||
9305 | s64 cfs_quota_us) | ||
9306 | { | ||
9307 | return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); | ||
9308 | } | ||
9309 | |||
9310 | static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) | ||
9311 | { | ||
9312 | return tg_get_cfs_period(cgroup_tg(cgrp)); | ||
9313 | } | ||
9314 | |||
9315 | static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, | ||
9316 | u64 cfs_period_us) | ||
9317 | { | ||
9318 | return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); | ||
9319 | } | ||
9320 | |||
9321 | struct cfs_schedulable_data { | ||
9322 | struct task_group *tg; | ||
9323 | u64 period, quota; | ||
9324 | }; | ||
9325 | |||
9326 | /* | ||
9327 | * normalize group quota/period to be quota/max_period | ||
9328 | * note: units are usecs | ||
9329 | */ | ||
9330 | static u64 normalize_cfs_quota(struct task_group *tg, | ||
9331 | struct cfs_schedulable_data *d) | ||
9332 | { | ||
9333 | u64 quota, period; | ||
9334 | |||
9335 | if (tg == d->tg) { | ||
9336 | period = d->period; | ||
9337 | quota = d->quota; | ||
9338 | } else { | ||
9339 | period = tg_get_cfs_period(tg); | ||
9340 | quota = tg_get_cfs_quota(tg); | ||
9341 | } | ||
9342 | |||
9343 | /* note: these should typically be equivalent */ | ||
9344 | if (quota == RUNTIME_INF || quota == -1) | ||
9345 | return RUNTIME_INF; | ||
9346 | |||
9347 | return to_ratio(period, quota); | ||
9348 | } | ||
9349 | |||
9350 | static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | ||
9351 | { | ||
9352 | struct cfs_schedulable_data *d = data; | ||
9353 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
9354 | s64 quota = 0, parent_quota = -1; | ||
9355 | |||
9356 | if (!tg->parent) { | ||
9357 | quota = RUNTIME_INF; | ||
9358 | } else { | ||
9359 | struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); | ||
9360 | |||
9361 | quota = normalize_cfs_quota(tg, d); | ||
9362 | parent_quota = parent_b->hierarchal_quota; | ||
9363 | |||
9364 | /* | ||
9365 | * ensure max(child_quota) <= parent_quota, inherit when no | ||
9366 | * limit is set | ||
9367 | */ | ||
9368 | if (quota == RUNTIME_INF) | ||
9369 | quota = parent_quota; | ||
9370 | else if (parent_quota != RUNTIME_INF && quota > parent_quota) | ||
9371 | return -EINVAL; | ||
9372 | } | ||
9373 | cfs_b->hierarchal_quota = quota; | ||
9374 | |||
9375 | return 0; | ||
9376 | } | ||
9377 | |||
9378 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) | ||
9379 | { | ||
9380 | int ret; | ||
9381 | struct cfs_schedulable_data data = { | ||
9382 | .tg = tg, | ||
9383 | .period = period, | ||
9384 | .quota = quota, | ||
9385 | }; | ||
9386 | |||
9387 | if (quota != RUNTIME_INF) { | ||
9388 | do_div(data.period, NSEC_PER_USEC); | ||
9389 | do_div(data.quota, NSEC_PER_USEC); | ||
9390 | } | ||
9391 | |||
9392 | rcu_read_lock(); | ||
9393 | ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); | ||
9394 | rcu_read_unlock(); | ||
9395 | |||
9396 | return ret; | ||
9397 | } | ||
9398 | |||
9399 | static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | ||
9400 | struct cgroup_map_cb *cb) | ||
9401 | { | ||
9402 | struct task_group *tg = cgroup_tg(cgrp); | ||
9403 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
9404 | |||
9405 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | ||
9406 | cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); | ||
9407 | cb->fill(cb, "throttled_time", cfs_b->throttled_time); | ||
9408 | |||
9409 | return 0; | ||
9410 | } | ||
9411 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
8995 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 9412 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8996 | 9413 | ||
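The cgroup writes funnel through tg_set_cfs_quota() and tg_set_cfs_period(), which convert microseconds to nanoseconds before calling the common tg_set_cfs_bandwidth(). A sketch of capping a group at half a CPU (error handling abbreviated; the in-kernel callers are the cgroup write handlers above):

	ret = tg_set_cfs_period(tg, 500000);		/* 500000us -> 500ms period */
	if (!ret)
		ret = tg_set_cfs_quota(tg, 250000);	/* 250000us -> 250ms quota */

	/*
	 * Equivalently, from userspace via the files declared below:
	 *   echo 500000 > cpu.cfs_period_us
	 *   echo 250000 > cpu.cfs_quota_us
	 */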
8997 | #ifdef CONFIG_RT_GROUP_SCHED | 9414 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -9026,6 +9443,22 @@ static struct cftype cpu_files[] = { | |||
9026 | .write_u64 = cpu_shares_write_u64, | 9443 | .write_u64 = cpu_shares_write_u64, |
9027 | }, | 9444 | }, |
9028 | #endif | 9445 | #endif |
9446 | #ifdef CONFIG_CFS_BANDWIDTH | ||
9447 | { | ||
9448 | .name = "cfs_quota_us", | ||
9449 | .read_s64 = cpu_cfs_quota_read_s64, | ||
9450 | .write_s64 = cpu_cfs_quota_write_s64, | ||
9451 | }, | ||
9452 | { | ||
9453 | .name = "cfs_period_us", | ||
9454 | .read_u64 = cpu_cfs_period_read_u64, | ||
9455 | .write_u64 = cpu_cfs_period_write_u64, | ||
9456 | }, | ||
9457 | { | ||
9458 | .name = "stat", | ||
9459 | .read_map = cpu_stats_show, | ||
9460 | }, | ||
9461 | #endif | ||
9029 | #ifdef CONFIG_RT_GROUP_SCHED | 9462 | #ifdef CONFIG_RT_GROUP_SCHED |
9030 | { | 9463 | { |
9031 | .name = "rt_runtime_us", | 9464 | .name = "rt_runtime_us", |
@@ -9335,4 +9768,3 @@ struct cgroup_subsys cpuacct_subsys = { | |||
9335 | .subsys_id = cpuacct_subsys_id, | 9768 | .subsys_id = cpuacct_subsys_id, |
9336 | }; | 9769 | }; |
9337 | #endif /* CONFIG_CGROUP_CPUACCT */ | 9770 | #endif /* CONFIG_CGROUP_CPUACCT */ |