diff options
author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2012-01-09 02:38:23 -0500 |
---|---|---|
committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2012-01-09 02:38:23 -0500 |
commit | da733563be5a9da26fe81d9f007262d00b846e22 (patch) | |
tree | db28291df94a2043af2123911984c5c173da4e6f /kernel/sched.c | |
parent | 6ccbcf2cb41131f8d56ef0723bf3f7c1f8486076 (diff) | |
parent | dab78d7924598ea4031663dd10db814e2e324928 (diff) |
Merge branch 'next' into for-linus
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 750 |
1 files changed, 579 insertions, 171 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index ccacdbdecf45..0e9344a71be3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void) | |||
196 | return sysctl_sched_rt_runtime >= 0; | 196 | return sysctl_sched_rt_runtime >= 0; |
197 | } | 197 | } |
198 | 198 | ||
199 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | 199 | static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) |
200 | { | 200 | { |
201 | ktime_t now; | 201 | unsigned long delta; |
202 | ktime_t soft, hard, now; | ||
203 | |||
204 | for (;;) { | ||
205 | if (hrtimer_active(period_timer)) | ||
206 | break; | ||
207 | |||
208 | now = hrtimer_cb_get_time(period_timer); | ||
209 | hrtimer_forward(period_timer, now, period); | ||
202 | 210 | ||
211 | soft = hrtimer_get_softexpires(period_timer); | ||
212 | hard = hrtimer_get_expires(period_timer); | ||
213 | delta = ktime_to_ns(ktime_sub(hard, soft)); | ||
214 | __hrtimer_start_range_ns(period_timer, soft, delta, | ||
215 | HRTIMER_MODE_ABS_PINNED, 0); | ||
216 | } | ||
217 | } | ||
218 | |||
219 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
220 | { | ||
203 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | 221 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) |
204 | return; | 222 | return; |
205 | 223 | ||
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
207 | return; | 225 | return; |
208 | 226 | ||
209 | raw_spin_lock(&rt_b->rt_runtime_lock); | 227 | raw_spin_lock(&rt_b->rt_runtime_lock); |
210 | for (;;) { | 228 | start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); |
211 | unsigned long delta; | ||
212 | ktime_t soft, hard; | ||
213 | |||
214 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
215 | break; | ||
216 | |||
217 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); | ||
218 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); | ||
219 | |||
220 | soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); | ||
221 | hard = hrtimer_get_expires(&rt_b->rt_period_timer); | ||
222 | delta = ktime_to_ns(ktime_sub(hard, soft)); | ||
223 | __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, | ||
224 | HRTIMER_MODE_ABS_PINNED, 0); | ||
225 | } | ||
226 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 229 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
227 | } | 230 | } |
228 | 231 | ||
@@ -247,6 +250,24 @@ struct cfs_rq; | |||
247 | 250 | ||
248 | static LIST_HEAD(task_groups); | 251 | static LIST_HEAD(task_groups); |
249 | 252 | ||
253 | struct cfs_bandwidth { | ||
254 | #ifdef CONFIG_CFS_BANDWIDTH | ||
255 | raw_spinlock_t lock; | ||
256 | ktime_t period; | ||
257 | u64 quota, runtime; | ||
258 | s64 hierarchal_quota; | ||
259 | u64 runtime_expires; | ||
260 | |||
261 | int idle, timer_active; | ||
262 | struct hrtimer period_timer, slack_timer; | ||
263 | struct list_head throttled_cfs_rq; | ||
264 | |||
265 | /* statistics */ | ||
266 | int nr_periods, nr_throttled; | ||
267 | u64 throttled_time; | ||
268 | #endif | ||
269 | }; | ||
270 | |||
250 | /* task group related information */ | 271 | /* task group related information */ |
251 | struct task_group { | 272 | struct task_group { |
252 | struct cgroup_subsys_state css; | 273 | struct cgroup_subsys_state css; |
@@ -278,6 +299,8 @@ struct task_group { | |||
278 | #ifdef CONFIG_SCHED_AUTOGROUP | 299 | #ifdef CONFIG_SCHED_AUTOGROUP |
279 | struct autogroup *autogroup; | 300 | struct autogroup *autogroup; |
280 | #endif | 301 | #endif |
302 | |||
303 | struct cfs_bandwidth cfs_bandwidth; | ||
281 | }; | 304 | }; |
282 | 305 | ||
283 | /* task_group_lock serializes the addition/removal of task groups */ | 306 | /* task_group_lock serializes the addition/removal of task groups */ |
@@ -311,7 +334,7 @@ struct task_group root_task_group; | |||
311 | /* CFS-related fields in a runqueue */ | 334 | /* CFS-related fields in a runqueue */ |
312 | struct cfs_rq { | 335 | struct cfs_rq { |
313 | struct load_weight load; | 336 | struct load_weight load; |
314 | unsigned long nr_running; | 337 | unsigned long nr_running, h_nr_running; |
315 | 338 | ||
316 | u64 exec_clock; | 339 | u64 exec_clock; |
317 | u64 min_vruntime; | 340 | u64 min_vruntime; |
@@ -377,9 +400,120 @@ struct cfs_rq { | |||
377 | 400 | ||
378 | unsigned long load_contribution; | 401 | unsigned long load_contribution; |
379 | #endif | 402 | #endif |
403 | #ifdef CONFIG_CFS_BANDWIDTH | ||
404 | int runtime_enabled; | ||
405 | u64 runtime_expires; | ||
406 | s64 runtime_remaining; | ||
407 | |||
408 | u64 throttled_timestamp; | ||
409 | int throttled, throttle_count; | ||
410 | struct list_head throttled_list; | ||
411 | #endif | ||
380 | #endif | 412 | #endif |
381 | }; | 413 | }; |
382 | 414 | ||
415 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
416 | #ifdef CONFIG_CFS_BANDWIDTH | ||
417 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
418 | { | ||
419 | return &tg->cfs_bandwidth; | ||
420 | } | ||
421 | |||
422 | static inline u64 default_cfs_period(void); | ||
423 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | ||
424 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | ||
425 | |||
426 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | ||
427 | { | ||
428 | struct cfs_bandwidth *cfs_b = | ||
429 | container_of(timer, struct cfs_bandwidth, slack_timer); | ||
430 | do_sched_cfs_slack_timer(cfs_b); | ||
431 | |||
432 | return HRTIMER_NORESTART; | ||
433 | } | ||
434 | |||
435 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | ||
436 | { | ||
437 | struct cfs_bandwidth *cfs_b = | ||
438 | container_of(timer, struct cfs_bandwidth, period_timer); | ||
439 | ktime_t now; | ||
440 | int overrun; | ||
441 | int idle = 0; | ||
442 | |||
443 | for (;;) { | ||
444 | now = hrtimer_cb_get_time(timer); | ||
445 | overrun = hrtimer_forward(timer, now, cfs_b->period); | ||
446 | |||
447 | if (!overrun) | ||
448 | break; | ||
449 | |||
450 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | ||
451 | } | ||
452 | |||
453 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
454 | } | ||
455 | |||
456 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
457 | { | ||
458 | raw_spin_lock_init(&cfs_b->lock); | ||
459 | cfs_b->runtime = 0; | ||
460 | cfs_b->quota = RUNTIME_INF; | ||
461 | cfs_b->period = ns_to_ktime(default_cfs_period()); | ||
462 | |||
463 | INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); | ||
464 | hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
465 | cfs_b->period_timer.function = sched_cfs_period_timer; | ||
466 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
467 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | ||
468 | } | ||
469 | |||
470 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
471 | { | ||
472 | cfs_rq->runtime_enabled = 0; | ||
473 | INIT_LIST_HEAD(&cfs_rq->throttled_list); | ||
474 | } | ||
475 | |||
476 | /* requires cfs_b->lock, may release to reprogram timer */ | ||
477 | static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
478 | { | ||
479 | /* | ||
480 | * The timer may be active because we're trying to set a new bandwidth | ||
481 | * period or because we're racing with the tear-down path | ||
482 | * (timer_active==0 becomes visible before the hrtimer call-back | ||
483 | * terminates). In either case we ensure that it's re-programmed | ||
484 | */ | ||
485 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | ||
486 | raw_spin_unlock(&cfs_b->lock); | ||
487 | /* ensure cfs_b->lock is available while we wait */ | ||
488 | hrtimer_cancel(&cfs_b->period_timer); | ||
489 | |||
490 | raw_spin_lock(&cfs_b->lock); | ||
491 | /* if someone else restarted the timer then we're done */ | ||
492 | if (cfs_b->timer_active) | ||
493 | return; | ||
494 | } | ||
495 | |||
496 | cfs_b->timer_active = 1; | ||
497 | start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); | ||
498 | } | ||
499 | |||
500 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
501 | { | ||
502 | hrtimer_cancel(&cfs_b->period_timer); | ||
503 | hrtimer_cancel(&cfs_b->slack_timer); | ||
504 | } | ||
505 | #else | ||
506 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
507 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
508 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
509 | |||
510 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
511 | { | ||
512 | return NULL; | ||
513 | } | ||
514 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
515 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
516 | |||
383 | /* Real-Time classes' related field in a runqueue: */ | 517 | /* Real-Time classes' related field in a runqueue: */ |
384 | struct rt_rq { | 518 | struct rt_rq { |
385 | struct rt_prio_array active; | 519 | struct rt_prio_array active; |
@@ -510,7 +644,7 @@ struct rq { | |||
510 | 644 | ||
511 | unsigned long cpu_power; | 645 | unsigned long cpu_power; |
512 | 646 | ||
513 | unsigned char idle_at_tick; | 647 | unsigned char idle_balance; |
514 | /* For active balancing */ | 648 | /* For active balancing */ |
515 | int post_schedule; | 649 | int post_schedule; |
516 | int active_balance; | 650 | int active_balance; |
@@ -520,8 +654,6 @@ struct rq { | |||
520 | int cpu; | 654 | int cpu; |
521 | int online; | 655 | int online; |
522 | 656 | ||
523 | unsigned long avg_load_per_task; | ||
524 | |||
525 | u64 rt_avg; | 657 | u64 rt_avg; |
526 | u64 age_stamp; | 658 | u64 age_stamp; |
527 | u64 idle_stamp; | 659 | u64 idle_stamp; |
@@ -570,7 +702,7 @@ struct rq { | |||
570 | #endif | 702 | #endif |
571 | 703 | ||
572 | #ifdef CONFIG_SMP | 704 | #ifdef CONFIG_SMP |
573 | struct task_struct *wake_list; | 705 | struct llist_head wake_list; |
574 | #endif | 706 | #endif |
575 | }; | 707 | }; |
576 | 708 | ||
@@ -1272,6 +1404,18 @@ void wake_up_idle_cpu(int cpu) | |||
1272 | smp_send_reschedule(cpu); | 1404 | smp_send_reschedule(cpu); |
1273 | } | 1405 | } |
1274 | 1406 | ||
1407 | static inline bool got_nohz_idle_kick(void) | ||
1408 | { | ||
1409 | return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; | ||
1410 | } | ||
1411 | |||
1412 | #else /* CONFIG_NO_HZ */ | ||
1413 | |||
1414 | static inline bool got_nohz_idle_kick(void) | ||
1415 | { | ||
1416 | return false; | ||
1417 | } | ||
1418 | |||
1275 | #endif /* CONFIG_NO_HZ */ | 1419 | #endif /* CONFIG_NO_HZ */ |
1276 | 1420 | ||
1277 | static u64 sched_avg_period(void) | 1421 | static u64 sched_avg_period(void) |
@@ -1471,24 +1615,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
1471 | update_load_sub(&rq->load, load); | 1615 | update_load_sub(&rq->load, load); |
1472 | } | 1616 | } |
1473 | 1617 | ||
1474 | #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) | 1618 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ |
1619 | (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) | ||
1475 | typedef int (*tg_visitor)(struct task_group *, void *); | 1620 | typedef int (*tg_visitor)(struct task_group *, void *); |
1476 | 1621 | ||
1477 | /* | 1622 | /* |
1478 | * Iterate the full tree, calling @down when first entering a node and @up when | 1623 | * Iterate task_group tree rooted at *from, calling @down when first entering a |
1479 | * leaving it for the final time. | 1624 | * node and @up when leaving it for the final time. |
1625 | * | ||
1626 | * Caller must hold rcu_lock or sufficient equivalent. | ||
1480 | */ | 1627 | */ |
1481 | static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | 1628 | static int walk_tg_tree_from(struct task_group *from, |
1629 | tg_visitor down, tg_visitor up, void *data) | ||
1482 | { | 1630 | { |
1483 | struct task_group *parent, *child; | 1631 | struct task_group *parent, *child; |
1484 | int ret; | 1632 | int ret; |
1485 | 1633 | ||
1486 | rcu_read_lock(); | 1634 | parent = from; |
1487 | parent = &root_task_group; | 1635 | |
1488 | down: | 1636 | down: |
1489 | ret = (*down)(parent, data); | 1637 | ret = (*down)(parent, data); |
1490 | if (ret) | 1638 | if (ret) |
1491 | goto out_unlock; | 1639 | goto out; |
1492 | list_for_each_entry_rcu(child, &parent->children, siblings) { | 1640 | list_for_each_entry_rcu(child, &parent->children, siblings) { |
1493 | parent = child; | 1641 | parent = child; |
1494 | goto down; | 1642 | goto down; |
@@ -1497,19 +1645,29 @@ up: | |||
1497 | continue; | 1645 | continue; |
1498 | } | 1646 | } |
1499 | ret = (*up)(parent, data); | 1647 | ret = (*up)(parent, data); |
1500 | if (ret) | 1648 | if (ret || parent == from) |
1501 | goto out_unlock; | 1649 | goto out; |
1502 | 1650 | ||
1503 | child = parent; | 1651 | child = parent; |
1504 | parent = parent->parent; | 1652 | parent = parent->parent; |
1505 | if (parent) | 1653 | if (parent) |
1506 | goto up; | 1654 | goto up; |
1507 | out_unlock: | 1655 | out: |
1508 | rcu_read_unlock(); | ||
1509 | |||
1510 | return ret; | 1656 | return ret; |
1511 | } | 1657 | } |
1512 | 1658 | ||
1659 | /* | ||
1660 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
1661 | * leaving it for the final time. | ||
1662 | * | ||
1663 | * Caller must hold rcu_lock or sufficient equivalent. | ||
1664 | */ | ||
1665 | |||
1666 | static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | ||
1667 | { | ||
1668 | return walk_tg_tree_from(&root_task_group, down, up, data); | ||
1669 | } | ||
1670 | |||
1513 | static int tg_nop(struct task_group *tg, void *data) | 1671 | static int tg_nop(struct task_group *tg, void *data) |
1514 | { | 1672 | { |
1515 | return 0; | 1673 | return 0; |
@@ -1569,11 +1727,9 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1569 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | 1727 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); |
1570 | 1728 | ||
1571 | if (nr_running) | 1729 | if (nr_running) |
1572 | rq->avg_load_per_task = rq->load.weight / nr_running; | 1730 | return rq->load.weight / nr_running; |
1573 | else | ||
1574 | rq->avg_load_per_task = 0; | ||
1575 | 1731 | ||
1576 | return rq->avg_load_per_task; | 1732 | return 0; |
1577 | } | 1733 | } |
1578 | 1734 | ||
1579 | #ifdef CONFIG_PREEMPT | 1735 | #ifdef CONFIG_PREEMPT |
@@ -1739,7 +1895,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1739 | #ifdef CONFIG_SMP | 1895 | #ifdef CONFIG_SMP |
1740 | /* | 1896 | /* |
1741 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | 1897 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be |
1742 | * successfuly executed on another CPU. We must ensure that updates of | 1898 | * successfully executed on another CPU. We must ensure that updates of |
1743 | * per-task data have been completed by this moment. | 1899 | * per-task data have been completed by this moment. |
1744 | */ | 1900 | */ |
1745 | smp_wmb(); | 1901 | smp_wmb(); |
@@ -1806,7 +1962,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1806 | rq->nr_uninterruptible--; | 1962 | rq->nr_uninterruptible--; |
1807 | 1963 | ||
1808 | enqueue_task(rq, p, flags); | 1964 | enqueue_task(rq, p, flags); |
1809 | inc_nr_running(rq); | ||
1810 | } | 1965 | } |
1811 | 1966 | ||
1812 | /* | 1967 | /* |
@@ -1818,7 +1973,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1818 | rq->nr_uninterruptible++; | 1973 | rq->nr_uninterruptible++; |
1819 | 1974 | ||
1820 | dequeue_task(rq, p, flags); | 1975 | dequeue_task(rq, p, flags); |
1821 | dec_nr_running(rq); | ||
1822 | } | 1976 | } |
1823 | 1977 | ||
1824 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 1978 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
@@ -2390,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2390 | 2544 | ||
2391 | /* Look for allowed, online CPU in same node. */ | 2545 | /* Look for allowed, online CPU in same node. */ |
2392 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) | 2546 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) |
2393 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 2547 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
2394 | return dest_cpu; | 2548 | return dest_cpu; |
2395 | 2549 | ||
2396 | /* Any allowed, online CPU? */ | 2550 | /* Any allowed, online CPU? */ |
2397 | dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); | 2551 | dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); |
2398 | if (dest_cpu < nr_cpu_ids) | 2552 | if (dest_cpu < nr_cpu_ids) |
2399 | return dest_cpu; | 2553 | return dest_cpu; |
2400 | 2554 | ||
@@ -2431,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) | |||
2431 | * [ this allows ->select_task() to simply return task_cpu(p) and | 2585 | * [ this allows ->select_task() to simply return task_cpu(p) and |
2432 | * not worry about this generic constraint ] | 2586 | * not worry about this generic constraint ] |
2433 | */ | 2587 | */ |
2434 | if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || | 2588 | if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || |
2435 | !cpu_online(cpu))) | 2589 | !cpu_online(cpu))) |
2436 | cpu = select_fallback_rq(task_cpu(p), p); | 2590 | cpu = select_fallback_rq(task_cpu(p), p); |
2437 | 2591 | ||
@@ -2556,42 +2710,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
2556 | } | 2710 | } |
2557 | 2711 | ||
2558 | #ifdef CONFIG_SMP | 2712 | #ifdef CONFIG_SMP |
2559 | static void sched_ttwu_do_pending(struct task_struct *list) | 2713 | static void sched_ttwu_pending(void) |
2560 | { | 2714 | { |
2561 | struct rq *rq = this_rq(); | 2715 | struct rq *rq = this_rq(); |
2716 | struct llist_node *llist = llist_del_all(&rq->wake_list); | ||
2717 | struct task_struct *p; | ||
2562 | 2718 | ||
2563 | raw_spin_lock(&rq->lock); | 2719 | raw_spin_lock(&rq->lock); |
2564 | 2720 | ||
2565 | while (list) { | 2721 | while (llist) { |
2566 | struct task_struct *p = list; | 2722 | p = llist_entry(llist, struct task_struct, wake_entry); |
2567 | list = list->wake_entry; | 2723 | llist = llist_next(llist); |
2568 | ttwu_do_activate(rq, p, 0); | 2724 | ttwu_do_activate(rq, p, 0); |
2569 | } | 2725 | } |
2570 | 2726 | ||
2571 | raw_spin_unlock(&rq->lock); | 2727 | raw_spin_unlock(&rq->lock); |
2572 | } | 2728 | } |
2573 | 2729 | ||
2574 | #ifdef CONFIG_HOTPLUG_CPU | ||
2575 | |||
2576 | static void sched_ttwu_pending(void) | ||
2577 | { | ||
2578 | struct rq *rq = this_rq(); | ||
2579 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2580 | |||
2581 | if (!list) | ||
2582 | return; | ||
2583 | |||
2584 | sched_ttwu_do_pending(list); | ||
2585 | } | ||
2586 | |||
2587 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
2588 | |||
2589 | void scheduler_ipi(void) | 2730 | void scheduler_ipi(void) |
2590 | { | 2731 | { |
2591 | struct rq *rq = this_rq(); | 2732 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) |
2592 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2593 | |||
2594 | if (!list) | ||
2595 | return; | 2733 | return; |
2596 | 2734 | ||
2597 | /* | 2735 | /* |
@@ -2608,25 +2746,21 @@ void scheduler_ipi(void) | |||
2608 | * somewhat pessimize the simple resched case. | 2746 | * somewhat pessimize the simple resched case. |
2609 | */ | 2747 | */ |
2610 | irq_enter(); | 2748 | irq_enter(); |
2611 | sched_ttwu_do_pending(list); | 2749 | sched_ttwu_pending(); |
2750 | |||
2751 | /* | ||
2752 | * Check if someone kicked us for doing the nohz idle load balance. | ||
2753 | */ | ||
2754 | if (unlikely(got_nohz_idle_kick() && !need_resched())) { | ||
2755 | this_rq()->idle_balance = 1; | ||
2756 | raise_softirq_irqoff(SCHED_SOFTIRQ); | ||
2757 | } | ||
2612 | irq_exit(); | 2758 | irq_exit(); |
2613 | } | 2759 | } |
2614 | 2760 | ||
2615 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | 2761 | static void ttwu_queue_remote(struct task_struct *p, int cpu) |
2616 | { | 2762 | { |
2617 | struct rq *rq = cpu_rq(cpu); | 2763 | if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) |
2618 | struct task_struct *next = rq->wake_list; | ||
2619 | |||
2620 | for (;;) { | ||
2621 | struct task_struct *old = next; | ||
2622 | |||
2623 | p->wake_entry = next; | ||
2624 | next = cmpxchg(&rq->wake_list, old, p); | ||
2625 | if (next == old) | ||
2626 | break; | ||
2627 | } | ||
2628 | |||
2629 | if (!next) | ||
2630 | smp_send_reschedule(cpu); | 2764 | smp_send_reschedule(cpu); |
2631 | } | 2765 | } |
2632 | 2766 | ||
@@ -2848,19 +2982,23 @@ void sched_fork(struct task_struct *p) | |||
2848 | p->state = TASK_RUNNING; | 2982 | p->state = TASK_RUNNING; |
2849 | 2983 | ||
2850 | /* | 2984 | /* |
2985 | * Make sure we do not leak PI boosting priority to the child. | ||
2986 | */ | ||
2987 | p->prio = current->normal_prio; | ||
2988 | |||
2989 | /* | ||
2851 | * Revert to default priority/policy on fork if requested. | 2990 | * Revert to default priority/policy on fork if requested. |
2852 | */ | 2991 | */ |
2853 | if (unlikely(p->sched_reset_on_fork)) { | 2992 | if (unlikely(p->sched_reset_on_fork)) { |
2854 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { | 2993 | if (task_has_rt_policy(p)) { |
2855 | p->policy = SCHED_NORMAL; | 2994 | p->policy = SCHED_NORMAL; |
2856 | p->normal_prio = p->static_prio; | ||
2857 | } | ||
2858 | |||
2859 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
2860 | p->static_prio = NICE_TO_PRIO(0); | 2995 | p->static_prio = NICE_TO_PRIO(0); |
2861 | p->normal_prio = p->static_prio; | 2996 | p->rt_priority = 0; |
2862 | set_load_weight(p); | 2997 | } else if (PRIO_TO_NICE(p->static_prio) < 0) |
2863 | } | 2998 | p->static_prio = NICE_TO_PRIO(0); |
2999 | |||
3000 | p->prio = p->normal_prio = __normal_prio(p); | ||
3001 | set_load_weight(p); | ||
2864 | 3002 | ||
2865 | /* | 3003 | /* |
2866 | * We don't need the reset flag anymore after the fork. It has | 3004 | * We don't need the reset flag anymore after the fork. It has |
@@ -2869,11 +3007,6 @@ void sched_fork(struct task_struct *p) | |||
2869 | p->sched_reset_on_fork = 0; | 3007 | p->sched_reset_on_fork = 0; |
2870 | } | 3008 | } |
2871 | 3009 | ||
2872 | /* | ||
2873 | * Make sure we do not leak PI boosting priority to the child. | ||
2874 | */ | ||
2875 | p->prio = current->normal_prio; | ||
2876 | |||
2877 | if (!rt_prio(p->prio)) | 3010 | if (!rt_prio(p->prio)) |
2878 | p->sched_class = &fair_sched_class; | 3011 | p->sched_class = &fair_sched_class; |
2879 | 3012 | ||
@@ -3065,7 +3198,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
3065 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 3198 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
3066 | local_irq_disable(); | 3199 | local_irq_disable(); |
3067 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | 3200 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ |
3068 | perf_event_task_sched_in(current); | 3201 | perf_event_task_sched_in(prev, current); |
3069 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 3202 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
3070 | local_irq_enable(); | 3203 | local_irq_enable(); |
3071 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | 3204 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ |
@@ -3725,30 +3858,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3725 | } | 3858 | } |
3726 | 3859 | ||
3727 | /* | 3860 | /* |
3728 | * Return sum_exec_runtime for the thread group. | ||
3729 | * In case the task is currently running, return the sum plus current's | ||
3730 | * pending runtime that have not been accounted yet. | ||
3731 | * | ||
3732 | * Note that the thread group might have other running tasks as well, | ||
3733 | * so the return value not includes other pending runtime that other | ||
3734 | * running tasks might have. | ||
3735 | */ | ||
3736 | unsigned long long thread_group_sched_runtime(struct task_struct *p) | ||
3737 | { | ||
3738 | struct task_cputime totals; | ||
3739 | unsigned long flags; | ||
3740 | struct rq *rq; | ||
3741 | u64 ns; | ||
3742 | |||
3743 | rq = task_rq_lock(p, &flags); | ||
3744 | thread_group_cputime(p, &totals); | ||
3745 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); | ||
3746 | task_rq_unlock(rq, p, &flags); | ||
3747 | |||
3748 | return ns; | ||
3749 | } | ||
3750 | |||
3751 | /* | ||
3752 | * Account user cpu time to a process. | 3861 | * Account user cpu time to a process. |
3753 | * @p: the process that the cpu time gets accounted to | 3862 | * @p: the process that the cpu time gets accounted to |
3754 | * @cputime: the cpu time spent in user space since the last update | 3863 | * @cputime: the cpu time spent in user space since the last update |
@@ -4140,7 +4249,7 @@ void scheduler_tick(void) | |||
4140 | perf_event_task_tick(); | 4249 | perf_event_task_tick(); |
4141 | 4250 | ||
4142 | #ifdef CONFIG_SMP | 4251 | #ifdef CONFIG_SMP |
4143 | rq->idle_at_tick = idle_cpu(cpu); | 4252 | rq->idle_balance = idle_cpu(cpu); |
4144 | trigger_load_balance(rq, cpu); | 4253 | trigger_load_balance(rq, cpu); |
4145 | #endif | 4254 | #endif |
4146 | } | 4255 | } |
@@ -4237,6 +4346,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
4237 | */ | 4346 | */ |
4238 | if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) | 4347 | if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) |
4239 | __schedule_bug(prev); | 4348 | __schedule_bug(prev); |
4349 | rcu_sleep_check(); | ||
4240 | 4350 | ||
4241 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 4351 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
4242 | 4352 | ||
@@ -4263,7 +4373,7 @@ pick_next_task(struct rq *rq) | |||
4263 | * Optimization: we know that if all tasks are in | 4373 | * Optimization: we know that if all tasks are in |
4264 | * the fair class we can call that function directly: | 4374 | * the fair class we can call that function directly: |
4265 | */ | 4375 | */ |
4266 | if (likely(rq->nr_running == rq->cfs.nr_running)) { | 4376 | if (likely(rq->nr_running == rq->cfs.h_nr_running)) { |
4267 | p = fair_sched_class.pick_next_task(rq); | 4377 | p = fair_sched_class.pick_next_task(rq); |
4268 | if (likely(p)) | 4378 | if (likely(p)) |
4269 | return p; | 4379 | return p; |
@@ -4279,9 +4389,9 @@ pick_next_task(struct rq *rq) | |||
4279 | } | 4389 | } |
4280 | 4390 | ||
4281 | /* | 4391 | /* |
4282 | * schedule() is the main scheduler function. | 4392 | * __schedule() is the main scheduler function. |
4283 | */ | 4393 | */ |
4284 | asmlinkage void __sched schedule(void) | 4394 | static void __sched __schedule(void) |
4285 | { | 4395 | { |
4286 | struct task_struct *prev, *next; | 4396 | struct task_struct *prev, *next; |
4287 | unsigned long *switch_count; | 4397 | unsigned long *switch_count; |
@@ -4322,16 +4432,6 @@ need_resched: | |||
4322 | if (to_wakeup) | 4432 | if (to_wakeup) |
4323 | try_to_wake_up_local(to_wakeup); | 4433 | try_to_wake_up_local(to_wakeup); |
4324 | } | 4434 | } |
4325 | |||
4326 | /* | ||
4327 | * If we are going to sleep and we have plugged IO | ||
4328 | * queued, make sure to submit it to avoid deadlocks. | ||
4329 | */ | ||
4330 | if (blk_needs_flush_plug(prev)) { | ||
4331 | raw_spin_unlock(&rq->lock); | ||
4332 | blk_schedule_flush_plug(prev); | ||
4333 | raw_spin_lock(&rq->lock); | ||
4334 | } | ||
4335 | } | 4435 | } |
4336 | switch_count = &prev->nvcsw; | 4436 | switch_count = &prev->nvcsw; |
4337 | } | 4437 | } |
@@ -4369,6 +4469,26 @@ need_resched: | |||
4369 | if (need_resched()) | 4469 | if (need_resched()) |
4370 | goto need_resched; | 4470 | goto need_resched; |
4371 | } | 4471 | } |
4472 | |||
4473 | static inline void sched_submit_work(struct task_struct *tsk) | ||
4474 | { | ||
4475 | if (!tsk->state) | ||
4476 | return; | ||
4477 | /* | ||
4478 | * If we are going to sleep and we have plugged IO queued, | ||
4479 | * make sure to submit it to avoid deadlocks. | ||
4480 | */ | ||
4481 | if (blk_needs_flush_plug(tsk)) | ||
4482 | blk_schedule_flush_plug(tsk); | ||
4483 | } | ||
4484 | |||
4485 | asmlinkage void __sched schedule(void) | ||
4486 | { | ||
4487 | struct task_struct *tsk = current; | ||
4488 | |||
4489 | sched_submit_work(tsk); | ||
4490 | __schedule(); | ||
4491 | } | ||
4372 | EXPORT_SYMBOL(schedule); | 4492 | EXPORT_SYMBOL(schedule); |
4373 | 4493 | ||
4374 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 4494 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
@@ -4435,7 +4555,7 @@ asmlinkage void __sched notrace preempt_schedule(void) | |||
4435 | 4555 | ||
4436 | do { | 4556 | do { |
4437 | add_preempt_count_notrace(PREEMPT_ACTIVE); | 4557 | add_preempt_count_notrace(PREEMPT_ACTIVE); |
4438 | schedule(); | 4558 | __schedule(); |
4439 | sub_preempt_count_notrace(PREEMPT_ACTIVE); | 4559 | sub_preempt_count_notrace(PREEMPT_ACTIVE); |
4440 | 4560 | ||
4441 | /* | 4561 | /* |
@@ -4463,7 +4583,7 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
4463 | do { | 4583 | do { |
4464 | add_preempt_count(PREEMPT_ACTIVE); | 4584 | add_preempt_count(PREEMPT_ACTIVE); |
4465 | local_irq_enable(); | 4585 | local_irq_enable(); |
4466 | schedule(); | 4586 | __schedule(); |
4467 | local_irq_disable(); | 4587 | local_irq_disable(); |
4468 | sub_preempt_count(PREEMPT_ACTIVE); | 4588 | sub_preempt_count(PREEMPT_ACTIVE); |
4469 | 4589 | ||
@@ -5039,7 +5159,20 @@ EXPORT_SYMBOL(task_nice); | |||
5039 | */ | 5159 | */ |
5040 | int idle_cpu(int cpu) | 5160 | int idle_cpu(int cpu) |
5041 | { | 5161 | { |
5042 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; | 5162 | struct rq *rq = cpu_rq(cpu); |
5163 | |||
5164 | if (rq->curr != rq->idle) | ||
5165 | return 0; | ||
5166 | |||
5167 | if (rq->nr_running) | ||
5168 | return 0; | ||
5169 | |||
5170 | #ifdef CONFIG_SMP | ||
5171 | if (!llist_empty(&rq->wake_list)) | ||
5172 | return 0; | ||
5173 | #endif | ||
5174 | |||
5175 | return 1; | ||
5043 | } | 5176 | } |
5044 | 5177 | ||
5045 | /** | 5178 | /** |
@@ -5588,7 +5721,7 @@ static inline int should_resched(void) | |||
5588 | static void __cond_resched(void) | 5721 | static void __cond_resched(void) |
5589 | { | 5722 | { |
5590 | add_preempt_count(PREEMPT_ACTIVE); | 5723 | add_preempt_count(PREEMPT_ACTIVE); |
5591 | schedule(); | 5724 | __schedule(); |
5592 | sub_preempt_count(PREEMPT_ACTIVE); | 5725 | sub_preempt_count(PREEMPT_ACTIVE); |
5593 | } | 5726 | } |
5594 | 5727 | ||
@@ -5889,7 +6022,7 @@ void show_state_filter(unsigned long state_filter) | |||
5889 | printk(KERN_INFO | 6022 | printk(KERN_INFO |
5890 | " task PC stack pid father\n"); | 6023 | " task PC stack pid father\n"); |
5891 | #endif | 6024 | #endif |
5892 | read_lock(&tasklist_lock); | 6025 | rcu_read_lock(); |
5893 | do_each_thread(g, p) { | 6026 | do_each_thread(g, p) { |
5894 | /* | 6027 | /* |
5895 | * reset the NMI-timeout, listing all files on a slow | 6028 | * reset the NMI-timeout, listing all files on a slow |
@@ -5905,7 +6038,7 @@ void show_state_filter(unsigned long state_filter) | |||
5905 | #ifdef CONFIG_SCHED_DEBUG | 6038 | #ifdef CONFIG_SCHED_DEBUG |
5906 | sysrq_sched_debug_show(); | 6039 | sysrq_sched_debug_show(); |
5907 | #endif | 6040 | #endif |
5908 | read_unlock(&tasklist_lock); | 6041 | rcu_read_unlock(); |
5909 | /* | 6042 | /* |
5910 | * Only show locks if all tasks are dumped: | 6043 | * Only show locks if all tasks are dumped: |
5911 | */ | 6044 | */ |
@@ -5969,15 +6102,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5969 | } | 6102 | } |
5970 | 6103 | ||
5971 | /* | 6104 | /* |
5972 | * In a system that switches off the HZ timer nohz_cpu_mask | ||
5973 | * indicates which cpus entered this state. This is used | ||
5974 | * in the rcu update to wait only for active cpus. For system | ||
5975 | * which do not switch off the HZ timer nohz_cpu_mask should | ||
5976 | * always be CPU_BITS_NONE. | ||
5977 | */ | ||
5978 | cpumask_var_t nohz_cpu_mask; | ||
5979 | |||
5980 | /* | ||
5981 | * Increase the granularity value when there are more CPUs, | 6105 | * Increase the granularity value when there are more CPUs, |
5982 | * because with more CPUs the 'effective latency' as visible | 6106 | * because with more CPUs the 'effective latency' as visible |
5983 | * to users decreases. But the relationship is not linear, | 6107 | * to users decreases. But the relationship is not linear, |
@@ -6029,10 +6153,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
6029 | { | 6153 | { |
6030 | if (p->sched_class && p->sched_class->set_cpus_allowed) | 6154 | if (p->sched_class && p->sched_class->set_cpus_allowed) |
6031 | p->sched_class->set_cpus_allowed(p, new_mask); | 6155 | p->sched_class->set_cpus_allowed(p, new_mask); |
6032 | else { | 6156 | |
6033 | cpumask_copy(&p->cpus_allowed, new_mask); | 6157 | cpumask_copy(&p->cpus_allowed, new_mask); |
6034 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | 6158 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); |
6035 | } | ||
6036 | } | 6159 | } |
6037 | 6160 | ||
6038 | /* | 6161 | /* |
@@ -6130,7 +6253,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
6130 | if (task_cpu(p) != src_cpu) | 6253 | if (task_cpu(p) != src_cpu) |
6131 | goto done; | 6254 | goto done; |
6132 | /* Affinity changed (again). */ | 6255 | /* Affinity changed (again). */ |
6133 | if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 6256 | if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
6134 | goto fail; | 6257 | goto fail; |
6135 | 6258 | ||
6136 | /* | 6259 | /* |
@@ -6211,6 +6334,30 @@ static void calc_global_load_remove(struct rq *rq) | |||
6211 | rq->calc_load_active = 0; | 6334 | rq->calc_load_active = 0; |
6212 | } | 6335 | } |
6213 | 6336 | ||
6337 | #ifdef CONFIG_CFS_BANDWIDTH | ||
6338 | static void unthrottle_offline_cfs_rqs(struct rq *rq) | ||
6339 | { | ||
6340 | struct cfs_rq *cfs_rq; | ||
6341 | |||
6342 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
6343 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
6344 | |||
6345 | if (!cfs_rq->runtime_enabled) | ||
6346 | continue; | ||
6347 | |||
6348 | /* | ||
6349 | * clock_task is not advancing so we just need to make sure | ||
6350 | * there's some valid quota amount | ||
6351 | */ | ||
6352 | cfs_rq->runtime_remaining = cfs_b->quota; | ||
6353 | if (cfs_rq_throttled(cfs_rq)) | ||
6354 | unthrottle_cfs_rq(cfs_rq); | ||
6355 | } | ||
6356 | } | ||
6357 | #else | ||
6358 | static void unthrottle_offline_cfs_rqs(struct rq *rq) {} | ||
6359 | #endif | ||
6360 | |||
6214 | /* | 6361 | /* |
6215 | * Migrate all tasks from the rq, sleeping tasks will be migrated by | 6362 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
6216 | * try_to_wake_up()->select_task_rq(). | 6363 | * try_to_wake_up()->select_task_rq(). |
@@ -6236,6 +6383,9 @@ static void migrate_tasks(unsigned int dead_cpu) | |||
6236 | */ | 6383 | */ |
6237 | rq->stop = NULL; | 6384 | rq->stop = NULL; |
6238 | 6385 | ||
6386 | /* Ensure any throttled groups are reachable by pick_next_task */ | ||
6387 | unthrottle_offline_cfs_rqs(rq); | ||
6388 | |||
6239 | for ( ; ; ) { | 6389 | for ( ; ; ) { |
6240 | /* | 6390 | /* |
6241 | * There's this thread running, bail when that's the only | 6391 | * There's this thread running, bail when that's the only |
@@ -6937,8 +7087,6 @@ static int __init isolated_cpu_setup(char *str) | |||
6937 | 7087 | ||
6938 | __setup("isolcpus=", isolated_cpu_setup); | 7088 | __setup("isolcpus=", isolated_cpu_setup); |
6939 | 7089 | ||
6940 | #define SD_NODES_PER_DOMAIN 16 | ||
6941 | |||
6942 | #ifdef CONFIG_NUMA | 7090 | #ifdef CONFIG_NUMA |
6943 | 7091 | ||
6944 | /** | 7092 | /** |
@@ -7443,6 +7591,7 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
7443 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); | 7591 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); |
7444 | if (sd && (sd->flags & SD_OVERLAP)) | 7592 | if (sd && (sd->flags & SD_OVERLAP)) |
7445 | free_sched_groups(sd->groups, 0); | 7593 | free_sched_groups(sd->groups, 0); |
7594 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
7446 | kfree(*per_cpu_ptr(sdd->sg, j)); | 7595 | kfree(*per_cpu_ptr(sdd->sg, j)); |
7447 | kfree(*per_cpu_ptr(sdd->sgp, j)); | 7596 | kfree(*per_cpu_ptr(sdd->sgp, j)); |
7448 | } | 7597 | } |
@@ -7978,6 +8127,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
7978 | /* allow initial update_cfs_load() to truncate */ | 8127 | /* allow initial update_cfs_load() to truncate */ |
7979 | cfs_rq->load_stamp = 1; | 8128 | cfs_rq->load_stamp = 1; |
7980 | #endif | 8129 | #endif |
8130 | init_cfs_rq_runtime(cfs_rq); | ||
7981 | 8131 | ||
7982 | tg->cfs_rq[cpu] = cfs_rq; | 8132 | tg->cfs_rq[cpu] = cfs_rq; |
7983 | tg->se[cpu] = se; | 8133 | tg->se[cpu] = se; |
@@ -8117,6 +8267,7 @@ void __init sched_init(void) | |||
8117 | * We achieve this by letting root_task_group's tasks sit | 8267 | * We achieve this by letting root_task_group's tasks sit |
8118 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). | 8268 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). |
8119 | */ | 8269 | */ |
8270 | init_cfs_bandwidth(&root_task_group.cfs_bandwidth); | ||
8120 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); | 8271 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
8121 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8272 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8122 | 8273 | ||
@@ -8146,7 +8297,6 @@ void __init sched_init(void) | |||
8146 | rq_attach_root(rq, &def_root_domain); | 8297 | rq_attach_root(rq, &def_root_domain); |
8147 | #ifdef CONFIG_NO_HZ | 8298 | #ifdef CONFIG_NO_HZ |
8148 | rq->nohz_balance_kick = 0; | 8299 | rq->nohz_balance_kick = 0; |
8149 | init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); | ||
8150 | #endif | 8300 | #endif |
8151 | #endif | 8301 | #endif |
8152 | init_rq_hrtick(rq); | 8302 | init_rq_hrtick(rq); |
@@ -8188,8 +8338,6 @@ void __init sched_init(void) | |||
8188 | */ | 8338 | */ |
8189 | current->sched_class = &fair_sched_class; | 8339 | current->sched_class = &fair_sched_class; |
8190 | 8340 | ||
8191 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | ||
8192 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | ||
8193 | #ifdef CONFIG_SMP | 8341 | #ifdef CONFIG_SMP |
8194 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | 8342 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); |
8195 | #ifdef CONFIG_NO_HZ | 8343 | #ifdef CONFIG_NO_HZ |
@@ -8219,6 +8367,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) | |||
8219 | { | 8367 | { |
8220 | static unsigned long prev_jiffy; /* ratelimiting */ | 8368 | static unsigned long prev_jiffy; /* ratelimiting */ |
8221 | 8369 | ||
8370 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ | ||
8222 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || | 8371 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || |
8223 | system_state != SYSTEM_RUNNING || oops_in_progress) | 8372 | system_state != SYSTEM_RUNNING || oops_in_progress) |
8224 | return; | 8373 | return; |
@@ -8358,6 +8507,8 @@ static void free_fair_sched_group(struct task_group *tg) | |||
8358 | { | 8507 | { |
8359 | int i; | 8508 | int i; |
8360 | 8509 | ||
8510 | destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
8511 | |||
8361 | for_each_possible_cpu(i) { | 8512 | for_each_possible_cpu(i) { |
8362 | if (tg->cfs_rq) | 8513 | if (tg->cfs_rq) |
8363 | kfree(tg->cfs_rq[i]); | 8514 | kfree(tg->cfs_rq[i]); |
@@ -8385,6 +8536,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8385 | 8536 | ||
8386 | tg->shares = NICE_0_LOAD; | 8537 | tg->shares = NICE_0_LOAD; |
8387 | 8538 | ||
8539 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
8540 | |||
8388 | for_each_possible_cpu(i) { | 8541 | for_each_possible_cpu(i) { |
8389 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8542 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8390 | GFP_KERNEL, cpu_to_node(i)); | 8543 | GFP_KERNEL, cpu_to_node(i)); |
@@ -8660,12 +8813,7 @@ unsigned long sched_group_shares(struct task_group *tg) | |||
8660 | } | 8813 | } |
8661 | #endif | 8814 | #endif |
8662 | 8815 | ||
8663 | #ifdef CONFIG_RT_GROUP_SCHED | 8816 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) |
8664 | /* | ||
8665 | * Ensure that the real time constraints are schedulable. | ||
8666 | */ | ||
8667 | static DEFINE_MUTEX(rt_constraints_mutex); | ||
8668 | |||
8669 | static unsigned long to_ratio(u64 period, u64 runtime) | 8817 | static unsigned long to_ratio(u64 period, u64 runtime) |
8670 | { | 8818 | { |
8671 | if (runtime == RUNTIME_INF) | 8819 | if (runtime == RUNTIME_INF) |
@@ -8673,6 +8821,13 @@ static unsigned long to_ratio(u64 period, u64 runtime) | |||
8673 | 8821 | ||
8674 | return div64_u64(runtime << 20, period); | 8822 | return div64_u64(runtime << 20, period); |
8675 | } | 8823 | } |
8824 | #endif | ||
8825 | |||
8826 | #ifdef CONFIG_RT_GROUP_SCHED | ||
8827 | /* | ||
8828 | * Ensure that the real time constraints are schedulable. | ||
8829 | */ | ||
8830 | static DEFINE_MUTEX(rt_constraints_mutex); | ||
8676 | 8831 | ||
8677 | /* Must be called with tasklist_lock held */ | 8832 | /* Must be called with tasklist_lock held */ |
8678 | static inline int tg_has_rt_tasks(struct task_group *tg) | 8833 | static inline int tg_has_rt_tasks(struct task_group *tg) |
@@ -8693,7 +8848,7 @@ struct rt_schedulable_data { | |||
8693 | u64 rt_runtime; | 8848 | u64 rt_runtime; |
8694 | }; | 8849 | }; |
8695 | 8850 | ||
8696 | static int tg_schedulable(struct task_group *tg, void *data) | 8851 | static int tg_rt_schedulable(struct task_group *tg, void *data) |
8697 | { | 8852 | { |
8698 | struct rt_schedulable_data *d = data; | 8853 | struct rt_schedulable_data *d = data; |
8699 | struct task_group *child; | 8854 | struct task_group *child; |
@@ -8751,16 +8906,22 @@ static int tg_schedulable(struct task_group *tg, void *data) | |||
8751 | 8906 | ||
8752 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8907 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
8753 | { | 8908 | { |
8909 | int ret; | ||
8910 | |||
8754 | struct rt_schedulable_data data = { | 8911 | struct rt_schedulable_data data = { |
8755 | .tg = tg, | 8912 | .tg = tg, |
8756 | .rt_period = period, | 8913 | .rt_period = period, |
8757 | .rt_runtime = runtime, | 8914 | .rt_runtime = runtime, |
8758 | }; | 8915 | }; |
8759 | 8916 | ||
8760 | return walk_tg_tree(tg_schedulable, tg_nop, &data); | 8917 | rcu_read_lock(); |
8918 | ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); | ||
8919 | rcu_read_unlock(); | ||
8920 | |||
8921 | return ret; | ||
8761 | } | 8922 | } |
8762 | 8923 | ||
8763 | static int tg_set_bandwidth(struct task_group *tg, | 8924 | static int tg_set_rt_bandwidth(struct task_group *tg, |
8764 | u64 rt_period, u64 rt_runtime) | 8925 | u64 rt_period, u64 rt_runtime) |
8765 | { | 8926 | { |
8766 | int i, err = 0; | 8927 | int i, err = 0; |
@@ -8799,7 +8960,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
8799 | if (rt_runtime_us < 0) | 8960 | if (rt_runtime_us < 0) |
8800 | rt_runtime = RUNTIME_INF; | 8961 | rt_runtime = RUNTIME_INF; |
8801 | 8962 | ||
8802 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8963 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
8803 | } | 8964 | } |
8804 | 8965 | ||
8805 | long sched_group_rt_runtime(struct task_group *tg) | 8966 | long sched_group_rt_runtime(struct task_group *tg) |
@@ -8824,7 +8985,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | |||
8824 | if (rt_period == 0) | 8985 | if (rt_period == 0) |
8825 | return -EINVAL; | 8986 | return -EINVAL; |
8826 | 8987 | ||
8827 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8988 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
8828 | } | 8989 | } |
8829 | 8990 | ||
8830 | long sched_group_rt_period(struct task_group *tg) | 8991 | long sched_group_rt_period(struct task_group *tg) |
@@ -9014,6 +9175,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
9014 | 9175 | ||
9015 | return (u64) scale_load_down(tg->shares); | 9176 | return (u64) scale_load_down(tg->shares); |
9016 | } | 9177 | } |
9178 | |||
9179 | #ifdef CONFIG_CFS_BANDWIDTH | ||
9180 | static DEFINE_MUTEX(cfs_constraints_mutex); | ||
9181 | |||
9182 | const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ | ||
9183 | const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ | ||
9184 | |||
9185 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); | ||
9186 | |||
9187 | static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | ||
9188 | { | ||
9189 | int i, ret = 0, runtime_enabled; | ||
9190 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
9191 | |||
9192 | if (tg == &root_task_group) | ||
9193 | return -EINVAL; | ||
9194 | |||
9195 | /* | ||
9196 | * Ensure we have at least some amount of bandwidth every period. This is | ||
9197 | * to prevent reaching a state of large arrears when throttled via | ||
9198 | * entity_tick() resulting in prolonged exit starvation. | ||
9199 | */ | ||
9200 | if (quota < min_cfs_quota_period || period < min_cfs_quota_period) | ||
9201 | return -EINVAL; | ||
9202 | |||
9203 | /* | ||
9204 | * Likewise, bound things on the other side by preventing insane quota | ||
9205 | * periods. This also allows us to normalize in computing quota | ||
9206 | * feasibility. | ||
9207 | */ | ||
9208 | if (period > max_cfs_quota_period) | ||
9209 | return -EINVAL; | ||
9210 | |||
9211 | mutex_lock(&cfs_constraints_mutex); | ||
9212 | ret = __cfs_schedulable(tg, period, quota); | ||
9213 | if (ret) | ||
9214 | goto out_unlock; | ||
9215 | |||
9216 | runtime_enabled = quota != RUNTIME_INF; | ||
9217 | raw_spin_lock_irq(&cfs_b->lock); | ||
9218 | cfs_b->period = ns_to_ktime(period); | ||
9219 | cfs_b->quota = quota; | ||
9220 | |||
9221 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
9222 | /* restart the period timer (if active) to handle new period expiry */ | ||
9223 | if (runtime_enabled && cfs_b->timer_active) { | ||
9224 | /* force a reprogram */ | ||
9225 | cfs_b->timer_active = 0; | ||
9226 | __start_cfs_bandwidth(cfs_b); | ||
9227 | } | ||
9228 | raw_spin_unlock_irq(&cfs_b->lock); | ||
9229 | |||
9230 | for_each_possible_cpu(i) { | ||
9231 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; | ||
9232 | struct rq *rq = rq_of(cfs_rq); | ||
9233 | |||
9234 | raw_spin_lock_irq(&rq->lock); | ||
9235 | cfs_rq->runtime_enabled = runtime_enabled; | ||
9236 | cfs_rq->runtime_remaining = 0; | ||
9237 | |||
9238 | if (cfs_rq_throttled(cfs_rq)) | ||
9239 | unthrottle_cfs_rq(cfs_rq); | ||
9240 | raw_spin_unlock_irq(&rq->lock); | ||
9241 | } | ||
9242 | out_unlock: | ||
9243 | mutex_unlock(&cfs_constraints_mutex); | ||
9244 | |||
9245 | return ret; | ||
9246 | } | ||
9247 | |||
9248 | int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) | ||
9249 | { | ||
9250 | u64 quota, period; | ||
9251 | |||
9252 | period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | ||
9253 | if (cfs_quota_us < 0) | ||
9254 | quota = RUNTIME_INF; | ||
9255 | else | ||
9256 | quota = (u64)cfs_quota_us * NSEC_PER_USEC; | ||
9257 | |||
9258 | return tg_set_cfs_bandwidth(tg, period, quota); | ||
9259 | } | ||
9260 | |||
9261 | long tg_get_cfs_quota(struct task_group *tg) | ||
9262 | { | ||
9263 | u64 quota_us; | ||
9264 | |||
9265 | if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) | ||
9266 | return -1; | ||
9267 | |||
9268 | quota_us = tg_cfs_bandwidth(tg)->quota; | ||
9269 | do_div(quota_us, NSEC_PER_USEC); | ||
9270 | |||
9271 | return quota_us; | ||
9272 | } | ||
9273 | |||
9274 | int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) | ||
9275 | { | ||
9276 | u64 quota, period; | ||
9277 | |||
9278 | period = (u64)cfs_period_us * NSEC_PER_USEC; | ||
9279 | quota = tg_cfs_bandwidth(tg)->quota; | ||
9280 | |||
9281 | if (period <= 0) | ||
9282 | return -EINVAL; | ||
9283 | |||
9284 | return tg_set_cfs_bandwidth(tg, period, quota); | ||
9285 | } | ||
9286 | |||
9287 | long tg_get_cfs_period(struct task_group *tg) | ||
9288 | { | ||
9289 | u64 cfs_period_us; | ||
9290 | |||
9291 | cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | ||
9292 | do_div(cfs_period_us, NSEC_PER_USEC); | ||
9293 | |||
9294 | return cfs_period_us; | ||
9295 | } | ||
9296 | |||
9297 | static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) | ||
9298 | { | ||
9299 | return tg_get_cfs_quota(cgroup_tg(cgrp)); | ||
9300 | } | ||
9301 | |||
9302 | static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, | ||
9303 | s64 cfs_quota_us) | ||
9304 | { | ||
9305 | return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); | ||
9306 | } | ||
9307 | |||
9308 | static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) | ||
9309 | { | ||
9310 | return tg_get_cfs_period(cgroup_tg(cgrp)); | ||
9311 | } | ||
9312 | |||
9313 | static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, | ||
9314 | u64 cfs_period_us) | ||
9315 | { | ||
9316 | return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); | ||
9317 | } | ||
9318 | |||
9319 | struct cfs_schedulable_data { | ||
9320 | struct task_group *tg; | ||
9321 | u64 period, quota; | ||
9322 | }; | ||
9323 | |||
9324 | /* | ||
9325 | * normalize group quota/period to be quota/max_period | ||
9326 | * note: units are usecs | ||
9327 | */ | ||
9328 | static u64 normalize_cfs_quota(struct task_group *tg, | ||
9329 | struct cfs_schedulable_data *d) | ||
9330 | { | ||
9331 | u64 quota, period; | ||
9332 | |||
9333 | if (tg == d->tg) { | ||
9334 | period = d->period; | ||
9335 | quota = d->quota; | ||
9336 | } else { | ||
9337 | period = tg_get_cfs_period(tg); | ||
9338 | quota = tg_get_cfs_quota(tg); | ||
9339 | } | ||
9340 | |||
9341 | /* note: these should typically be equivalent */ | ||
9342 | if (quota == RUNTIME_INF || quota == -1) | ||
9343 | return RUNTIME_INF; | ||
9344 | |||
9345 | return to_ratio(period, quota); | ||
9346 | } | ||
9347 | |||
9348 | static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | ||
9349 | { | ||
9350 | struct cfs_schedulable_data *d = data; | ||
9351 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
9352 | s64 quota = 0, parent_quota = -1; | ||
9353 | |||
9354 | if (!tg->parent) { | ||
9355 | quota = RUNTIME_INF; | ||
9356 | } else { | ||
9357 | struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); | ||
9358 | |||
9359 | quota = normalize_cfs_quota(tg, d); | ||
9360 | parent_quota = parent_b->hierarchal_quota; | ||
9361 | |||
9362 | /* | ||
9363 | * ensure max(child_quota) <= parent_quota, inherit when no | ||
9364 | * limit is set | ||
9365 | */ | ||
9366 | if (quota == RUNTIME_INF) | ||
9367 | quota = parent_quota; | ||
9368 | else if (parent_quota != RUNTIME_INF && quota > parent_quota) | ||
9369 | return -EINVAL; | ||
9370 | } | ||
9371 | cfs_b->hierarchal_quota = quota; | ||
9372 | |||
9373 | return 0; | ||
9374 | } | ||
9375 | |||
9376 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) | ||
9377 | { | ||
9378 | int ret; | ||
9379 | struct cfs_schedulable_data data = { | ||
9380 | .tg = tg, | ||
9381 | .period = period, | ||
9382 | .quota = quota, | ||
9383 | }; | ||
9384 | |||
9385 | if (quota != RUNTIME_INF) { | ||
9386 | do_div(data.period, NSEC_PER_USEC); | ||
9387 | do_div(data.quota, NSEC_PER_USEC); | ||
9388 | } | ||
9389 | |||
9390 | rcu_read_lock(); | ||
9391 | ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); | ||
9392 | rcu_read_unlock(); | ||
9393 | |||
9394 | return ret; | ||
9395 | } | ||
9396 | |||
9397 | static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | ||
9398 | struct cgroup_map_cb *cb) | ||
9399 | { | ||
9400 | struct task_group *tg = cgroup_tg(cgrp); | ||
9401 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
9402 | |||
9403 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | ||
9404 | cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); | ||
9405 | cb->fill(cb, "throttled_time", cfs_b->throttled_time); | ||
9406 | |||
9407 | return 0; | ||
9408 | } | ||
9409 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
9017 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 9410 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
9018 | 9411 | ||
9019 | #ifdef CONFIG_RT_GROUP_SCHED | 9412 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -9048,6 +9441,22 @@ static struct cftype cpu_files[] = { | |||
9048 | .write_u64 = cpu_shares_write_u64, | 9441 | .write_u64 = cpu_shares_write_u64, |
9049 | }, | 9442 | }, |
9050 | #endif | 9443 | #endif |
9444 | #ifdef CONFIG_CFS_BANDWIDTH | ||
9445 | { | ||
9446 | .name = "cfs_quota_us", | ||
9447 | .read_s64 = cpu_cfs_quota_read_s64, | ||
9448 | .write_s64 = cpu_cfs_quota_write_s64, | ||
9449 | }, | ||
9450 | { | ||
9451 | .name = "cfs_period_us", | ||
9452 | .read_u64 = cpu_cfs_period_read_u64, | ||
9453 | .write_u64 = cpu_cfs_period_write_u64, | ||
9454 | }, | ||
9455 | { | ||
9456 | .name = "stat", | ||
9457 | .read_map = cpu_stats_show, | ||
9458 | }, | ||
9459 | #endif | ||
9051 | #ifdef CONFIG_RT_GROUP_SCHED | 9460 | #ifdef CONFIG_RT_GROUP_SCHED |
9052 | { | 9461 | { |
9053 | .name = "rt_runtime_us", | 9462 | .name = "rt_runtime_us", |
@@ -9357,4 +9766,3 @@ struct cgroup_subsys cpuacct_subsys = { | |||
9357 | .subsys_id = cpuacct_subsys_id, | 9766 | .subsys_id = cpuacct_subsys_id, |
9358 | }; | 9767 | }; |
9359 | #endif /* CONFIG_CGROUP_CPUACCT */ | 9768 | #endif /* CONFIG_CGROUP_CPUACCT */ |
9360 | |||