-rw-r--r--  Documentation/scheduler/sched-bwc.txt   122
-rw-r--r--  include/linux/sched.h                      4
-rw-r--r--  include/trace/events/sched.h               9
-rw-r--r--  init/Kconfig                              12
-rw-r--r--  kernel/sched.c                           559
-rw-r--r--  kernel/sched_cpupri.c                     89
-rw-r--r--  kernel/sched_cpupri.h                      7
-rw-r--r--  kernel/sched_fair.c                      716
-rw-r--r--  kernel/sched_features.h                    5
-rw-r--r--  kernel/sched_rt.c                         91
-rw-r--r--  kernel/sched_stoptask.c                    2
-rw-r--r--  kernel/sysctl.c                           10
12 files changed, 1439 insertions, 187 deletions
diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt
new file mode 100644
index 000000000000..f6b1873f68ab
--- /dev/null
+++ b/Documentation/scheduler/sched-bwc.txt
@@ -0,0 +1,122 @@
CFS Bandwidth Control
=====================

[ This document only discusses CPU bandwidth control for SCHED_NORMAL.
  The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.txt ]

CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the
specification of the maximum CPU bandwidth available to a group or hierarchy.

The bandwidth allowed for a group is specified using a quota and period. Within
each given "period" (microseconds), a group is allowed to consume only up to
"quota" microseconds of CPU time. When the CPU bandwidth consumption of a
group exceeds this limit (for that period), the tasks belonging to its
hierarchy will be throttled and are not allowed to run again until the next
period.

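As a quick worked illustration of the ratio this implies (the figures are
chosen purely for illustration): with period=200ms, a quota of 100ms allows the
group at most quota/period = 50% of one CPU each period, while a quota of 400ms
over the same period allows up to 2 CPUs worth of runtime.
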
A group's unused runtime is globally tracked, being refreshed to the quota
amount described above at each period boundary. As threads consume this
bandwidth it is transferred to cpu-local "silos" on a demand basis. The amount
transferred within each of these updates is tunable and described as the
"slice".

Management
----------
Quota and period are managed within the cpu subsystem via cgroupfs.

cpu.cfs_quota_us: the total available run-time within a period (in microseconds)
cpu.cfs_period_us: the length of a period (in microseconds)
cpu.stat: exports throttling statistics [explained further below]

The default values are:
    cpu.cfs_period_us=100ms
    cpu.cfs_quota_us=-1

A value of -1 for cpu.cfs_quota_us indicates that the group does not have any
bandwidth restriction in place; such a group is described as an unconstrained
bandwidth group. This represents the traditional work-conserving behavior for
CFS.

Writing any (valid) positive value(s) will enact the specified bandwidth limit.
The minimum allowed value for either quota or period is 1ms. There is also an
upper bound on the period length of 1s. Additional restrictions exist when
bandwidth limits are used in a hierarchical fashion; these are explained in
more detail below.

Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit
and return the group to an unconstrained state once more.

Any updates to a group's bandwidth specification will result in it becoming
unthrottled if it is in a constrained state.

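For illustration, assuming the cpu controller is mounted at /sys/fs/cgroup/cpu
and a child group named "limited" has already been created beneath it (both the
mount point and the group name are assumptions made for this example), the
files above can be exercised as follows:

    # cd /sys/fs/cgroup/cpu/limited
    # cat cpu.cfs_period_us                 /* 100000, i.e. the 100ms default */
    # cat cpu.cfs_quota_us                  /* -1, i.e. unconstrained */
    # echo 50000 > cpu.cfs_quota_us         /* allow 50ms of run-time per period */
    # echo -1 > cpu.cfs_quota_us            /* remove the limit again */
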
System wide settings
--------------------
For efficiency, run-time is transferred between the global pool and CPU local
"silos" in a batch fashion. This greatly reduces global accounting pressure
on large systems. The amount transferred each time such an update is required
is described as the "slice".

This is tunable via procfs:
    /proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms)

Larger slice values will reduce transfer overheads, while smaller values allow
for more fine-grained consumption.

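For example, the current slice can be inspected and adjusted like so (the new
value below is purely illustrative):

    # cat /proc/sys/kernel/sched_cfs_bandwidth_slice_us             /* 5000 (5ms) by default */
    # echo 10000 > /proc/sys/kernel/sched_cfs_bandwidth_slice_us    /* switch to 10ms slices */
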
Statistics
----------
A group's bandwidth statistics are exported via 3 fields in cpu.stat.

cpu.stat:
- nr_periods: Number of enforcement intervals that have elapsed.
- nr_throttled: Number of times the group has been throttled/limited.
- throttled_time: The total time duration (in nanoseconds) for which entities
  of the group have been throttled.

This interface is read-only.

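A hypothetical reading for a group that has been throttled a handful of times
(the values are invented for illustration) might look like:

    # cat cpu.stat
    nr_periods 1000
    nr_throttled 30
    throttled_time 45000000
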
Hierarchical considerations
---------------------------
The interface enforces that an individual entity's bandwidth is always
attainable, that is: max(c_i) <= C. However, over-subscription in the
aggregate case is explicitly allowed to enable work-conserving semantics
within a hierarchy.
    e.g. \Sum (c_i) may exceed C
[ Where C is the parent's bandwidth, and c_i its children ]


There are two ways in which a group may become throttled:
    a. it fully consumes its own quota within a period
    b. a parent's quota is fully consumed within its period

In case b) above, even though the child may have runtime remaining it will not
be allowed to run until the parent's runtime is refreshed.

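To make case b) concrete (the figures are illustrative only): with a 100ms
period throughout, a parent limited to 100ms of quota may have two children
that are each granted 100ms. This satisfies max(c_i) <= C even though
\Sum (c_i) = 200ms exceeds C. If both children are busy they will jointly
exhaust the parent's 100ms partway through the period, and both will then be
throttled, despite having local quota remaining, until the parent's runtime is
refreshed at the next period boundary.
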
Examples
--------
1. Limit a group to 1 CPU worth of runtime.

    If period is 250ms and quota is also 250ms, the group will get
    1 CPU worth of runtime every 250ms.

    # echo 250000 > cpu.cfs_quota_us /* quota = 250ms */
    # echo 250000 > cpu.cfs_period_us /* period = 250ms */

2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine.

    With 500ms period and 1000ms quota, the group can get 2 CPUs worth of
    runtime every 500ms.

    # echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */
    # echo 500000 > cpu.cfs_period_us /* period = 500ms */

    The larger period here allows for increased burst capacity.

3. Limit a group to 20% of 1 CPU.

    With 50ms period, 10ms quota will be equivalent to 20% of 1 CPU.

    # echo 10000 > cpu.cfs_quota_us /* quota = 10ms */
    # echo 50000 > cpu.cfs_period_us /* period = 50ms */

    By using a small period here we are ensuring a consistent latency
    response at the expense of burst capacity.

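One further point worth illustrating: quota is consumed by all threads of the
group in aggregate, across all CPUs. For example (figures illustrative only),
on an 8-CPU machine a group running 8 busy threads against quota=400ms and
period=100ms will burn through its quota in roughly 50ms of wall time each
period and spend the remainder of the period throttled.
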
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 41d0237fd449..9fda2888a6ab 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2039,6 +2039,10 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { }
2039static inline void sched_autogroup_exit(struct signal_struct *sig) { } 2039static inline void sched_autogroup_exit(struct signal_struct *sig) { }
2040#endif 2040#endif
2041 2041
2042#ifdef CONFIG_CFS_BANDWIDTH
2043extern unsigned int sysctl_sched_cfs_bandwidth_slice;
2044#endif
2045
2042#ifdef CONFIG_RT_MUTEXES 2046#ifdef CONFIG_RT_MUTEXES
2043extern int rt_mutex_getprio(struct task_struct *p); 2047extern int rt_mutex_getprio(struct task_struct *p);
2044extern void rt_mutex_setprio(struct task_struct *p, int prio); 2048extern void rt_mutex_setprio(struct task_struct *p, int prio);
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index f6334782a593..959ff18b63b6 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
100 * For all intents and purposes a preempted task is a running task. 100 * For all intents and purposes a preempted task is a running task.
101 */ 101 */
102 if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE) 102 if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)
103 state = TASK_RUNNING; 103 state = TASK_RUNNING | TASK_STATE_MAX;
104#endif 104#endif
105 105
106 return state; 106 return state;
@@ -137,13 +137,14 @@ TRACE_EVENT(sched_switch,
137 __entry->next_prio = next->prio; 137 __entry->next_prio = next->prio;
138 ), 138 ),
139 139
140 TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s ==> next_comm=%s next_pid=%d next_prio=%d", 140 TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
141 __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, 141 __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
142 __entry->prev_state ? 142 __entry->prev_state & (TASK_STATE_MAX-1) ?
143 __print_flags(__entry->prev_state, "|", 143 __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
144 { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, 144 { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
145 { 16, "Z" }, { 32, "X" }, { 64, "x" }, 145 { 16, "Z" }, { 32, "X" }, { 64, "x" },
146 { 128, "W" }) : "R", 146 { 128, "W" }) : "R",
147 __entry->prev_state & TASK_STATE_MAX ? "+" : "",
147 __entry->next_comm, __entry->next_pid, __entry->next_prio) 148 __entry->next_comm, __entry->next_pid, __entry->next_prio)
148); 149);
149 150
diff --git a/init/Kconfig b/init/Kconfig
index d62778390e55..d19b3a77ab44 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED
715 depends on CGROUP_SCHED 715 depends on CGROUP_SCHED
716 default CGROUP_SCHED 716 default CGROUP_SCHED
717 717
718config CFS_BANDWIDTH
719 bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
720 depends on EXPERIMENTAL
721 depends on FAIR_GROUP_SCHED
722 default n
723 help
724 This option allows users to define CPU bandwidth rates (limits) for
725 tasks running within the fair group scheduler. Groups with no limit
726 set are considered to be unconstrained and will run with no
727 restriction.
728 See tip/Documentation/scheduler/sched-bwc.txt for more information.
729
718config RT_GROUP_SCHED 730config RT_GROUP_SCHED
719 bool "Group scheduling for SCHED_RR/FIFO" 731 bool "Group scheduling for SCHED_RR/FIFO"
720 depends on EXPERIMENTAL 732 depends on EXPERIMENTAL
diff --git a/kernel/sched.c b/kernel/sched.c
index b50b0f0c9aa9..c5cf15e1eb57 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void)
196 return sysctl_sched_rt_runtime >= 0; 196 return sysctl_sched_rt_runtime >= 0;
197} 197}
198 198
199static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 199static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
200{ 200{
201 ktime_t now; 201 unsigned long delta;
202 ktime_t soft, hard, now;
203
204 for (;;) {
205 if (hrtimer_active(period_timer))
206 break;
207
208 now = hrtimer_cb_get_time(period_timer);
209 hrtimer_forward(period_timer, now, period);
210
211 soft = hrtimer_get_softexpires(period_timer);
212 hard = hrtimer_get_expires(period_timer);
213 delta = ktime_to_ns(ktime_sub(hard, soft));
214 __hrtimer_start_range_ns(period_timer, soft, delta,
215 HRTIMER_MODE_ABS_PINNED, 0);
216 }
217}
202 218
219static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
220{
203 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 221 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
204 return; 222 return;
205 223
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
207 return; 225 return;
208 226
209 raw_spin_lock(&rt_b->rt_runtime_lock); 227 raw_spin_lock(&rt_b->rt_runtime_lock);
210 for (;;) { 228 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
211 unsigned long delta;
212 ktime_t soft, hard;
213
214 if (hrtimer_active(&rt_b->rt_period_timer))
215 break;
216
217 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
218 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
219
220 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
221 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
222 delta = ktime_to_ns(ktime_sub(hard, soft));
223 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
224 HRTIMER_MODE_ABS_PINNED, 0);
225 }
226 raw_spin_unlock(&rt_b->rt_runtime_lock); 229 raw_spin_unlock(&rt_b->rt_runtime_lock);
227} 230}
228 231
@@ -247,6 +250,24 @@ struct cfs_rq;
247 250
248static LIST_HEAD(task_groups); 251static LIST_HEAD(task_groups);
249 252
253struct cfs_bandwidth {
254#ifdef CONFIG_CFS_BANDWIDTH
255 raw_spinlock_t lock;
256 ktime_t period;
257 u64 quota, runtime;
258 s64 hierarchal_quota;
259 u64 runtime_expires;
260
261 int idle, timer_active;
262 struct hrtimer period_timer, slack_timer;
263 struct list_head throttled_cfs_rq;
264
265 /* statistics */
266 int nr_periods, nr_throttled;
267 u64 throttled_time;
268#endif
269};
270
250/* task group related information */ 271/* task group related information */
251struct task_group { 272struct task_group {
252 struct cgroup_subsys_state css; 273 struct cgroup_subsys_state css;
@@ -278,6 +299,8 @@ struct task_group {
278#ifdef CONFIG_SCHED_AUTOGROUP 299#ifdef CONFIG_SCHED_AUTOGROUP
279 struct autogroup *autogroup; 300 struct autogroup *autogroup;
280#endif 301#endif
302
303 struct cfs_bandwidth cfs_bandwidth;
281}; 304};
282 305
283/* task_group_lock serializes the addition/removal of task groups */ 306/* task_group_lock serializes the addition/removal of task groups */
@@ -311,7 +334,7 @@ struct task_group root_task_group;
311/* CFS-related fields in a runqueue */ 334/* CFS-related fields in a runqueue */
312struct cfs_rq { 335struct cfs_rq {
313 struct load_weight load; 336 struct load_weight load;
314 unsigned long nr_running; 337 unsigned long nr_running, h_nr_running;
315 338
316 u64 exec_clock; 339 u64 exec_clock;
317 u64 min_vruntime; 340 u64 min_vruntime;
@@ -377,9 +400,120 @@ struct cfs_rq {
377 400
378 unsigned long load_contribution; 401 unsigned long load_contribution;
379#endif 402#endif
403#ifdef CONFIG_CFS_BANDWIDTH
404 int runtime_enabled;
405 u64 runtime_expires;
406 s64 runtime_remaining;
407
408 u64 throttled_timestamp;
409 int throttled, throttle_count;
410 struct list_head throttled_list;
411#endif
380#endif 412#endif
381}; 413};
382 414
415#ifdef CONFIG_FAIR_GROUP_SCHED
416#ifdef CONFIG_CFS_BANDWIDTH
417static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
418{
419 return &tg->cfs_bandwidth;
420}
421
422static inline u64 default_cfs_period(void);
423static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
424static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
425
426static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
427{
428 struct cfs_bandwidth *cfs_b =
429 container_of(timer, struct cfs_bandwidth, slack_timer);
430 do_sched_cfs_slack_timer(cfs_b);
431
432 return HRTIMER_NORESTART;
433}
434
435static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
436{
437 struct cfs_bandwidth *cfs_b =
438 container_of(timer, struct cfs_bandwidth, period_timer);
439 ktime_t now;
440 int overrun;
441 int idle = 0;
442
443 for (;;) {
444 now = hrtimer_cb_get_time(timer);
445 overrun = hrtimer_forward(timer, now, cfs_b->period);
446
447 if (!overrun)
448 break;
449
450 idle = do_sched_cfs_period_timer(cfs_b, overrun);
451 }
452
453 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
454}
455
456static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
457{
458 raw_spin_lock_init(&cfs_b->lock);
459 cfs_b->runtime = 0;
460 cfs_b->quota = RUNTIME_INF;
461 cfs_b->period = ns_to_ktime(default_cfs_period());
462
463 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
464 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
465 cfs_b->period_timer.function = sched_cfs_period_timer;
466 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
467 cfs_b->slack_timer.function = sched_cfs_slack_timer;
468}
469
470static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
471{
472 cfs_rq->runtime_enabled = 0;
473 INIT_LIST_HEAD(&cfs_rq->throttled_list);
474}
475
476/* requires cfs_b->lock, may release to reprogram timer */
477static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
478{
479 /*
480 * The timer may be active because we're trying to set a new bandwidth
481 * period or because we're racing with the tear-down path
482 * (timer_active==0 becomes visible before the hrtimer call-back
483 * terminates). In either case we ensure that it's re-programmed
484 */
485 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
486 raw_spin_unlock(&cfs_b->lock);
487 /* ensure cfs_b->lock is available while we wait */
488 hrtimer_cancel(&cfs_b->period_timer);
489
490 raw_spin_lock(&cfs_b->lock);
491 /* if someone else restarted the timer then we're done */
492 if (cfs_b->timer_active)
493 return;
494 }
495
496 cfs_b->timer_active = 1;
497 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
498}
499
500static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
501{
502 hrtimer_cancel(&cfs_b->period_timer);
503 hrtimer_cancel(&cfs_b->slack_timer);
504}
505#else
506static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
507static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
508static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509
510static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
511{
512 return NULL;
513}
514#endif /* CONFIG_CFS_BANDWIDTH */
515#endif /* CONFIG_FAIR_GROUP_SCHED */
516
383/* Real-Time classes' related field in a runqueue: */ 517/* Real-Time classes' related field in a runqueue: */
384struct rt_rq { 518struct rt_rq {
385 struct rt_prio_array active; 519 struct rt_prio_array active;
@@ -520,8 +654,6 @@ struct rq {
520 int cpu; 654 int cpu;
521 int online; 655 int online;
522 656
523 unsigned long avg_load_per_task;
524
525 u64 rt_avg; 657 u64 rt_avg;
526 u64 age_stamp; 658 u64 age_stamp;
527 u64 idle_stamp; 659 u64 idle_stamp;
@@ -1471,24 +1603,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1471 update_load_sub(&rq->load, load); 1603 update_load_sub(&rq->load, load);
1472} 1604}
1473 1605
1474#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1606#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1607 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1475typedef int (*tg_visitor)(struct task_group *, void *); 1608typedef int (*tg_visitor)(struct task_group *, void *);
1476 1609
1477/* 1610/*
1478 * Iterate the full tree, calling @down when first entering a node and @up when 1611 * Iterate task_group tree rooted at *from, calling @down when first entering a
1479 * leaving it for the final time. 1612 * node and @up when leaving it for the final time.
1613 *
1614 * Caller must hold rcu_lock or sufficient equivalent.
1480 */ 1615 */
1481static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1616static int walk_tg_tree_from(struct task_group *from,
1617 tg_visitor down, tg_visitor up, void *data)
1482{ 1618{
1483 struct task_group *parent, *child; 1619 struct task_group *parent, *child;
1484 int ret; 1620 int ret;
1485 1621
1486 rcu_read_lock(); 1622 parent = from;
1487 parent = &root_task_group; 1623
1488down: 1624down:
1489 ret = (*down)(parent, data); 1625 ret = (*down)(parent, data);
1490 if (ret) 1626 if (ret)
1491 goto out_unlock; 1627 goto out;
1492 list_for_each_entry_rcu(child, &parent->children, siblings) { 1628 list_for_each_entry_rcu(child, &parent->children, siblings) {
1493 parent = child; 1629 parent = child;
1494 goto down; 1630 goto down;
@@ -1497,19 +1633,29 @@ up:
1497 continue; 1633 continue;
1498 } 1634 }
1499 ret = (*up)(parent, data); 1635 ret = (*up)(parent, data);
1500 if (ret) 1636 if (ret || parent == from)
1501 goto out_unlock; 1637 goto out;
1502 1638
1503 child = parent; 1639 child = parent;
1504 parent = parent->parent; 1640 parent = parent->parent;
1505 if (parent) 1641 if (parent)
1506 goto up; 1642 goto up;
1507out_unlock: 1643out:
1508 rcu_read_unlock();
1509
1510 return ret; 1644 return ret;
1511} 1645}
1512 1646
1647/*
1648 * Iterate the full tree, calling @down when first entering a node and @up when
1649 * leaving it for the final time.
1650 *
1651 * Caller must hold rcu_lock or sufficient equivalent.
1652 */
1653
1654static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1655{
1656 return walk_tg_tree_from(&root_task_group, down, up, data);
1657}
1658
1513static int tg_nop(struct task_group *tg, void *data) 1659static int tg_nop(struct task_group *tg, void *data)
1514{ 1660{
1515 return 0; 1661 return 0;
@@ -1569,11 +1715,9 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1569 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1715 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1570 1716
1571 if (nr_running) 1717 if (nr_running)
1572 rq->avg_load_per_task = rq->load.weight / nr_running; 1718 return rq->load.weight / nr_running;
1573 else
1574 rq->avg_load_per_task = 0;
1575 1719
1576 return rq->avg_load_per_task; 1720 return 0;
1577} 1721}
1578 1722
1579#ifdef CONFIG_PREEMPT 1723#ifdef CONFIG_PREEMPT
@@ -1806,7 +1950,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1806 rq->nr_uninterruptible--; 1950 rq->nr_uninterruptible--;
1807 1951
1808 enqueue_task(rq, p, flags); 1952 enqueue_task(rq, p, flags);
1809 inc_nr_running(rq);
1810} 1953}
1811 1954
1812/* 1955/*
@@ -1818,7 +1961,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1818 rq->nr_uninterruptible++; 1961 rq->nr_uninterruptible++;
1819 1962
1820 dequeue_task(rq, p, flags); 1963 dequeue_task(rq, p, flags);
1821 dec_nr_running(rq);
1822} 1964}
1823 1965
1824#ifdef CONFIG_IRQ_TIME_ACCOUNTING 1966#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2848,19 +2990,23 @@ void sched_fork(struct task_struct *p)
2848 p->state = TASK_RUNNING; 2990 p->state = TASK_RUNNING;
2849 2991
2850 /* 2992 /*
2993 * Make sure we do not leak PI boosting priority to the child.
2994 */
2995 p->prio = current->normal_prio;
2996
2997 /*
2851 * Revert to default priority/policy on fork if requested. 2998 * Revert to default priority/policy on fork if requested.
2852 */ 2999 */
2853 if (unlikely(p->sched_reset_on_fork)) { 3000 if (unlikely(p->sched_reset_on_fork)) {
2854 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 3001 if (task_has_rt_policy(p)) {
2855 p->policy = SCHED_NORMAL; 3002 p->policy = SCHED_NORMAL;
2856 p->normal_prio = p->static_prio;
2857 }
2858
2859 if (PRIO_TO_NICE(p->static_prio) < 0) {
2860 p->static_prio = NICE_TO_PRIO(0); 3003 p->static_prio = NICE_TO_PRIO(0);
2861 p->normal_prio = p->static_prio; 3004 p->rt_priority = 0;
2862 set_load_weight(p); 3005 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2863 } 3006 p->static_prio = NICE_TO_PRIO(0);
3007
3008 p->prio = p->normal_prio = __normal_prio(p);
3009 set_load_weight(p);
2864 3010
2865 /* 3011 /*
2866 * We don't need the reset flag anymore after the fork. It has 3012 * We don't need the reset flag anymore after the fork. It has
@@ -2869,11 +3015,6 @@ void sched_fork(struct task_struct *p)
2869 p->sched_reset_on_fork = 0; 3015 p->sched_reset_on_fork = 0;
2870 } 3016 }
2871 3017
2872 /*
2873 * Make sure we do not leak PI boosting priority to the child.
2874 */
2875 p->prio = current->normal_prio;
2876
2877 if (!rt_prio(p->prio)) 3018 if (!rt_prio(p->prio))
2878 p->sched_class = &fair_sched_class; 3019 p->sched_class = &fair_sched_class;
2879 3020
@@ -4239,7 +4380,7 @@ pick_next_task(struct rq *rq)
4239 * Optimization: we know that if all tasks are in 4380 * Optimization: we know that if all tasks are in
4240 * the fair class we can call that function directly: 4381 * the fair class we can call that function directly:
4241 */ 4382 */
4242 if (likely(rq->nr_running == rq->cfs.nr_running)) { 4383 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
4243 p = fair_sched_class.pick_next_task(rq); 4384 p = fair_sched_class.pick_next_task(rq);
4244 if (likely(p)) 4385 if (likely(p))
4245 return p; 4386 return p;
@@ -6197,6 +6338,30 @@ static void calc_global_load_remove(struct rq *rq)
6197 rq->calc_load_active = 0; 6338 rq->calc_load_active = 0;
6198} 6339}
6199 6340
6341#ifdef CONFIG_CFS_BANDWIDTH
6342static void unthrottle_offline_cfs_rqs(struct rq *rq)
6343{
6344 struct cfs_rq *cfs_rq;
6345
6346 for_each_leaf_cfs_rq(rq, cfs_rq) {
6347 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6348
6349 if (!cfs_rq->runtime_enabled)
6350 continue;
6351
6352 /*
6353 * clock_task is not advancing so we just need to make sure
6354 * there's some valid quota amount
6355 */
6356 cfs_rq->runtime_remaining = cfs_b->quota;
6357 if (cfs_rq_throttled(cfs_rq))
6358 unthrottle_cfs_rq(cfs_rq);
6359 }
6360}
6361#else
6362static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6363#endif
6364
6200/* 6365/*
6201 * Migrate all tasks from the rq, sleeping tasks will be migrated by 6366 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6202 * try_to_wake_up()->select_task_rq(). 6367 * try_to_wake_up()->select_task_rq().
@@ -6222,6 +6387,9 @@ static void migrate_tasks(unsigned int dead_cpu)
6222 */ 6387 */
6223 rq->stop = NULL; 6388 rq->stop = NULL;
6224 6389
6390 /* Ensure any throttled groups are reachable by pick_next_task */
6391 unthrottle_offline_cfs_rqs(rq);
6392
6225 for ( ; ; ) { 6393 for ( ; ; ) {
6226 /* 6394 /*
6227 * There's this thread running, bail when that's the only 6395 * There's this thread running, bail when that's the only
@@ -7965,6 +8133,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7965 /* allow initial update_cfs_load() to truncate */ 8133 /* allow initial update_cfs_load() to truncate */
7966 cfs_rq->load_stamp = 1; 8134 cfs_rq->load_stamp = 1;
7967#endif 8135#endif
8136 init_cfs_rq_runtime(cfs_rq);
7968 8137
7969 tg->cfs_rq[cpu] = cfs_rq; 8138 tg->cfs_rq[cpu] = cfs_rq;
7970 tg->se[cpu] = se; 8139 tg->se[cpu] = se;
@@ -8104,6 +8273,7 @@ void __init sched_init(void)
8104 * We achieve this by letting root_task_group's tasks sit 8273 * We achieve this by letting root_task_group's tasks sit
8105 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 8274 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8106 */ 8275 */
8276 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
8107 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 8277 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8108#endif /* CONFIG_FAIR_GROUP_SCHED */ 8278#endif /* CONFIG_FAIR_GROUP_SCHED */
8109 8279
@@ -8345,6 +8515,8 @@ static void free_fair_sched_group(struct task_group *tg)
8345{ 8515{
8346 int i; 8516 int i;
8347 8517
8518 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8519
8348 for_each_possible_cpu(i) { 8520 for_each_possible_cpu(i) {
8349 if (tg->cfs_rq) 8521 if (tg->cfs_rq)
8350 kfree(tg->cfs_rq[i]); 8522 kfree(tg->cfs_rq[i]);
@@ -8372,6 +8544,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8372 8544
8373 tg->shares = NICE_0_LOAD; 8545 tg->shares = NICE_0_LOAD;
8374 8546
8547 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8548
8375 for_each_possible_cpu(i) { 8549 for_each_possible_cpu(i) {
8376 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8550 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8377 GFP_KERNEL, cpu_to_node(i)); 8551 GFP_KERNEL, cpu_to_node(i));
@@ -8647,12 +8821,7 @@ unsigned long sched_group_shares(struct task_group *tg)
8647} 8821}
8648#endif 8822#endif
8649 8823
8650#ifdef CONFIG_RT_GROUP_SCHED 8824#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
8651/*
8652 * Ensure that the real time constraints are schedulable.
8653 */
8654static DEFINE_MUTEX(rt_constraints_mutex);
8655
8656static unsigned long to_ratio(u64 period, u64 runtime) 8825static unsigned long to_ratio(u64 period, u64 runtime)
8657{ 8826{
8658 if (runtime == RUNTIME_INF) 8827 if (runtime == RUNTIME_INF)
@@ -8660,6 +8829,13 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8660 8829
8661 return div64_u64(runtime << 20, period); 8830 return div64_u64(runtime << 20, period);
8662} 8831}
8832#endif
8833
8834#ifdef CONFIG_RT_GROUP_SCHED
8835/*
8836 * Ensure that the real time constraints are schedulable.
8837 */
8838static DEFINE_MUTEX(rt_constraints_mutex);
8663 8839
8664/* Must be called with tasklist_lock held */ 8840/* Must be called with tasklist_lock held */
8665static inline int tg_has_rt_tasks(struct task_group *tg) 8841static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -8680,7 +8856,7 @@ struct rt_schedulable_data {
8680 u64 rt_runtime; 8856 u64 rt_runtime;
8681}; 8857};
8682 8858
8683static int tg_schedulable(struct task_group *tg, void *data) 8859static int tg_rt_schedulable(struct task_group *tg, void *data)
8684{ 8860{
8685 struct rt_schedulable_data *d = data; 8861 struct rt_schedulable_data *d = data;
8686 struct task_group *child; 8862 struct task_group *child;
@@ -8738,16 +8914,22 @@ static int tg_schedulable(struct task_group *tg, void *data)
8738 8914
8739static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8915static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8740{ 8916{
8917 int ret;
8918
8741 struct rt_schedulable_data data = { 8919 struct rt_schedulable_data data = {
8742 .tg = tg, 8920 .tg = tg,
8743 .rt_period = period, 8921 .rt_period = period,
8744 .rt_runtime = runtime, 8922 .rt_runtime = runtime,
8745 }; 8923 };
8746 8924
8747 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8925 rcu_read_lock();
8926 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
8927 rcu_read_unlock();
8928
8929 return ret;
8748} 8930}
8749 8931
8750static int tg_set_bandwidth(struct task_group *tg, 8932static int tg_set_rt_bandwidth(struct task_group *tg,
8751 u64 rt_period, u64 rt_runtime) 8933 u64 rt_period, u64 rt_runtime)
8752{ 8934{
8753 int i, err = 0; 8935 int i, err = 0;
@@ -8786,7 +8968,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8786 if (rt_runtime_us < 0) 8968 if (rt_runtime_us < 0)
8787 rt_runtime = RUNTIME_INF; 8969 rt_runtime = RUNTIME_INF;
8788 8970
8789 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8971 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8790} 8972}
8791 8973
8792long sched_group_rt_runtime(struct task_group *tg) 8974long sched_group_rt_runtime(struct task_group *tg)
@@ -8811,7 +8993,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8811 if (rt_period == 0) 8993 if (rt_period == 0)
8812 return -EINVAL; 8994 return -EINVAL;
8813 8995
8814 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8996 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8815} 8997}
8816 8998
8817long sched_group_rt_period(struct task_group *tg) 8999long sched_group_rt_period(struct task_group *tg)
@@ -9001,6 +9183,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9001 9183
9002 return (u64) scale_load_down(tg->shares); 9184 return (u64) scale_load_down(tg->shares);
9003} 9185}
9186
9187#ifdef CONFIG_CFS_BANDWIDTH
9188static DEFINE_MUTEX(cfs_constraints_mutex);
9189
9190const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
9191const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9192
9193static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9194
9195static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9196{
9197 int i, ret = 0, runtime_enabled;
9198 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9199
9200 if (tg == &root_task_group)
9201 return -EINVAL;
9202
9203 /*
 9204	 * Ensure we have at least some amount of bandwidth every period. This is
9205 * to prevent reaching a state of large arrears when throttled via
9206 * entity_tick() resulting in prolonged exit starvation.
9207 */
9208 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
9209 return -EINVAL;
9210
9211 /*
 9212	 * Likewise, bound things on the other side by preventing insane quota
9213 * periods. This also allows us to normalize in computing quota
9214 * feasibility.
9215 */
9216 if (period > max_cfs_quota_period)
9217 return -EINVAL;
9218
9219 mutex_lock(&cfs_constraints_mutex);
9220 ret = __cfs_schedulable(tg, period, quota);
9221 if (ret)
9222 goto out_unlock;
9223
9224 runtime_enabled = quota != RUNTIME_INF;
9225 raw_spin_lock_irq(&cfs_b->lock);
9226 cfs_b->period = ns_to_ktime(period);
9227 cfs_b->quota = quota;
9228
9229 __refill_cfs_bandwidth_runtime(cfs_b);
9230 /* restart the period timer (if active) to handle new period expiry */
9231 if (runtime_enabled && cfs_b->timer_active) {
9232 /* force a reprogram */
9233 cfs_b->timer_active = 0;
9234 __start_cfs_bandwidth(cfs_b);
9235 }
9236 raw_spin_unlock_irq(&cfs_b->lock);
9237
9238 for_each_possible_cpu(i) {
9239 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9240 struct rq *rq = rq_of(cfs_rq);
9241
9242 raw_spin_lock_irq(&rq->lock);
9243 cfs_rq->runtime_enabled = runtime_enabled;
9244 cfs_rq->runtime_remaining = 0;
9245
9246 if (cfs_rq_throttled(cfs_rq))
9247 unthrottle_cfs_rq(cfs_rq);
9248 raw_spin_unlock_irq(&rq->lock);
9249 }
9250out_unlock:
9251 mutex_unlock(&cfs_constraints_mutex);
9252
9253 return ret;
9254}
9255
9256int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9257{
9258 u64 quota, period;
9259
9260 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9261 if (cfs_quota_us < 0)
9262 quota = RUNTIME_INF;
9263 else
9264 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9265
9266 return tg_set_cfs_bandwidth(tg, period, quota);
9267}
9268
9269long tg_get_cfs_quota(struct task_group *tg)
9270{
9271 u64 quota_us;
9272
9273 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
9274 return -1;
9275
9276 quota_us = tg_cfs_bandwidth(tg)->quota;
9277 do_div(quota_us, NSEC_PER_USEC);
9278
9279 return quota_us;
9280}
9281
9282int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9283{
9284 u64 quota, period;
9285
9286 period = (u64)cfs_period_us * NSEC_PER_USEC;
9287 quota = tg_cfs_bandwidth(tg)->quota;
9288
9289 if (period <= 0)
9290 return -EINVAL;
9291
9292 return tg_set_cfs_bandwidth(tg, period, quota);
9293}
9294
9295long tg_get_cfs_period(struct task_group *tg)
9296{
9297 u64 cfs_period_us;
9298
9299 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9300 do_div(cfs_period_us, NSEC_PER_USEC);
9301
9302 return cfs_period_us;
9303}
9304
9305static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
9306{
9307 return tg_get_cfs_quota(cgroup_tg(cgrp));
9308}
9309
9310static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
9311 s64 cfs_quota_us)
9312{
9313 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
9314}
9315
9316static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
9317{
9318 return tg_get_cfs_period(cgroup_tg(cgrp));
9319}
9320
9321static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9322 u64 cfs_period_us)
9323{
9324 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
9325}
9326
9327struct cfs_schedulable_data {
9328 struct task_group *tg;
9329 u64 period, quota;
9330};
9331
9332/*
9333 * normalize group quota/period to be quota/max_period
9334 * note: units are usecs
9335 */
9336static u64 normalize_cfs_quota(struct task_group *tg,
9337 struct cfs_schedulable_data *d)
9338{
9339 u64 quota, period;
9340
9341 if (tg == d->tg) {
9342 period = d->period;
9343 quota = d->quota;
9344 } else {
9345 period = tg_get_cfs_period(tg);
9346 quota = tg_get_cfs_quota(tg);
9347 }
9348
9349 /* note: these should typically be equivalent */
9350 if (quota == RUNTIME_INF || quota == -1)
9351 return RUNTIME_INF;
9352
9353 return to_ratio(period, quota);
9354}
9355
9356static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9357{
9358 struct cfs_schedulable_data *d = data;
9359 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9360 s64 quota = 0, parent_quota = -1;
9361
9362 if (!tg->parent) {
9363 quota = RUNTIME_INF;
9364 } else {
9365 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
9366
9367 quota = normalize_cfs_quota(tg, d);
9368 parent_quota = parent_b->hierarchal_quota;
9369
9370 /*
9371 * ensure max(child_quota) <= parent_quota, inherit when no
9372 * limit is set
9373 */
9374 if (quota == RUNTIME_INF)
9375 quota = parent_quota;
9376 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
9377 return -EINVAL;
9378 }
9379 cfs_b->hierarchal_quota = quota;
9380
9381 return 0;
9382}
9383
9384static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9385{
9386 int ret;
9387 struct cfs_schedulable_data data = {
9388 .tg = tg,
9389 .period = period,
9390 .quota = quota,
9391 };
9392
9393 if (quota != RUNTIME_INF) {
9394 do_div(data.period, NSEC_PER_USEC);
9395 do_div(data.quota, NSEC_PER_USEC);
9396 }
9397
9398 rcu_read_lock();
9399 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
9400 rcu_read_unlock();
9401
9402 return ret;
9403}
9404
9405static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9406 struct cgroup_map_cb *cb)
9407{
9408 struct task_group *tg = cgroup_tg(cgrp);
9409 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9410
9411 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9412 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
9413 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
9414
9415 return 0;
9416}
9417#endif /* CONFIG_CFS_BANDWIDTH */
9004#endif /* CONFIG_FAIR_GROUP_SCHED */ 9418#endif /* CONFIG_FAIR_GROUP_SCHED */
9005 9419
9006#ifdef CONFIG_RT_GROUP_SCHED 9420#ifdef CONFIG_RT_GROUP_SCHED
@@ -9035,6 +9449,22 @@ static struct cftype cpu_files[] = {
9035 .write_u64 = cpu_shares_write_u64, 9449 .write_u64 = cpu_shares_write_u64,
9036 }, 9450 },
9037#endif 9451#endif
9452#ifdef CONFIG_CFS_BANDWIDTH
9453 {
9454 .name = "cfs_quota_us",
9455 .read_s64 = cpu_cfs_quota_read_s64,
9456 .write_s64 = cpu_cfs_quota_write_s64,
9457 },
9458 {
9459 .name = "cfs_period_us",
9460 .read_u64 = cpu_cfs_period_read_u64,
9461 .write_u64 = cpu_cfs_period_write_u64,
9462 },
9463 {
9464 .name = "stat",
9465 .read_map = cpu_stats_show,
9466 },
9467#endif
9038#ifdef CONFIG_RT_GROUP_SCHED 9468#ifdef CONFIG_RT_GROUP_SCHED
9039 { 9469 {
9040 .name = "rt_runtime_us", 9470 .name = "rt_runtime_us",
@@ -9344,4 +9774,3 @@ struct cgroup_subsys cpuacct_subsys = {
9344 .subsys_id = cpuacct_subsys_id, 9774 .subsys_id = cpuacct_subsys_id,
9345}; 9775};
9346#endif /* CONFIG_CGROUP_CPUACCT */ 9776#endif /* CONFIG_CGROUP_CPUACCT */
9347
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 2722dc1b4138..a86cf9d9eb11 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,6 @@ static int convert_prio(int prio)
47 return cpupri; 47 return cpupri;
48} 48}
49 49
50#define for_each_cpupri_active(array, idx) \
51 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
52
53/** 50/**
54 * cpupri_find - find the best (lowest-pri) CPU in the system 51 * cpupri_find - find the best (lowest-pri) CPU in the system
55 * @cp: The cpupri context 52 * @cp: The cpupri context
@@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
71 int idx = 0; 68 int idx = 0;
72 int task_pri = convert_prio(p->prio); 69 int task_pri = convert_prio(p->prio);
73 70
74 for_each_cpupri_active(cp->pri_active, idx) { 71 if (task_pri >= MAX_RT_PRIO)
75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 72 return 0;
76 73
77 if (idx >= task_pri) 74 for (idx = 0; idx < task_pri; idx++) {
78 break; 75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
76 int skip = 0;
77
78 if (!atomic_read(&(vec)->count))
79 skip = 1;
80 /*
81 * When looking at the vector, we need to read the counter,
82 * do a memory barrier, then read the mask.
83 *
 84	 * Note: This is still all racy, but we can deal with it.
85 * Ideally, we only want to look at masks that are set.
86 *
87 * If a mask is not set, then the only thing wrong is that we
88 * did a little more work than necessary.
89 *
90 * If we read a zero count but the mask is set, because of the
91 * memory barriers, that can only happen when the highest prio
92 * task for a run queue has left the run queue, in which case,
93 * it will be followed by a pull. If the task we are processing
94 * fails to find a proper place to go, that pull request will
95 * pull this task if the run queue is running at a lower
96 * priority.
97 */
98 smp_rmb();
99
100 /* Need to do the rmb for every iteration */
101 if (skip)
102 continue;
79 103
80 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 104 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
81 continue; 105 continue;
@@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
115{ 139{
116 int *currpri = &cp->cpu_to_pri[cpu]; 140 int *currpri = &cp->cpu_to_pri[cpu];
117 int oldpri = *currpri; 141 int oldpri = *currpri;
118 unsigned long flags; 142 int do_mb = 0;
119 143
120 newpri = convert_prio(newpri); 144 newpri = convert_prio(newpri);
121 145
@@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
128 * If the cpu was currently mapped to a different value, we 152 * If the cpu was currently mapped to a different value, we
129 * need to map it to the new value then remove the old value. 153 * need to map it to the new value then remove the old value.
130 * Note, we must add the new value first, otherwise we risk the 154 * Note, we must add the new value first, otherwise we risk the
131 * cpu being cleared from pri_active, and this cpu could be 155 * cpu being missed by the priority loop in cpupri_find.
132 * missed for a push or pull.
133 */ 156 */
134 if (likely(newpri != CPUPRI_INVALID)) { 157 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 158 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136 159
137 raw_spin_lock_irqsave(&vec->lock, flags);
138
139 cpumask_set_cpu(cpu, vec->mask); 160 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 161 /*
141 if (vec->count == 1) 162 * When adding a new vector, we update the mask first,
142 set_bit(newpri, cp->pri_active); 163 * do a write memory barrier, and then update the count, to
143 164 * make sure the vector is visible when count is set.
144 raw_spin_unlock_irqrestore(&vec->lock, flags); 165 */
166 smp_mb__before_atomic_inc();
167 atomic_inc(&(vec)->count);
168 do_mb = 1;
145 } 169 }
146 if (likely(oldpri != CPUPRI_INVALID)) { 170 if (likely(oldpri != CPUPRI_INVALID)) {
147 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 171 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
148 172
149 raw_spin_lock_irqsave(&vec->lock, flags); 173 /*
150 174 * Because the order of modification of the vec->count
151 vec->count--; 175 * is important, we must make sure that the update
152 if (!vec->count) 176 * of the new prio is seen before we decrement the
153 clear_bit(oldpri, cp->pri_active); 177 * old prio. This makes sure that the loop sees
178 * one or the other when we raise the priority of
179 * the run queue. We don't care about when we lower the
180 * priority, as that will trigger an rt pull anyway.
181 *
182 * We only need to do a memory barrier if we updated
183 * the new priority vec.
184 */
185 if (do_mb)
186 smp_mb__after_atomic_inc();
187
188 /*
189 * When removing from the vector, we decrement the counter first
190 * do a memory barrier and then clear the mask.
191 */
192 atomic_dec(&(vec)->count);
193 smp_mb__after_atomic_inc();
154 cpumask_clear_cpu(cpu, vec->mask); 194 cpumask_clear_cpu(cpu, vec->mask);
155
156 raw_spin_unlock_irqrestore(&vec->lock, flags);
157 } 195 }
158 196
159 *currpri = newpri; 197 *currpri = newpri;
@@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp)
175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 213 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
176 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 214 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
177 215
178 raw_spin_lock_init(&vec->lock); 216 atomic_set(&vec->count, 0);
179 vec->count = 0;
180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) 217 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
181 goto cleanup; 218 goto cleanup;
182 } 219 }
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9fc7d386fea4..f6d756173491 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -4,7 +4,6 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5 5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
8 7
9#define CPUPRI_INVALID -1 8#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0 9#define CPUPRI_IDLE 0
@@ -12,14 +11,12 @@
12/* values 2-101 are RT priorities 0-99 */ 11/* values 2-101 are RT priorities 0-99 */
13 12
14struct cpupri_vec { 13struct cpupri_vec {
15 raw_spinlock_t lock; 14 atomic_t count;
16 int count; 15 cpumask_var_t mask;
17 cpumask_var_t mask;
18}; 16};
19 17
20struct cpupri { 18struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS]; 20 int cpu_to_pri[NR_CPUS];
24}; 21};
25 22
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bc8ee9993814..fef0bfde7c8c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
89 */ 89 */
90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; 90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
91 91
92#ifdef CONFIG_CFS_BANDWIDTH
93/*
94 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
95 * each time a cfs_rq requests quota.
96 *
97 * Note: in the case that the slice exceeds the runtime remaining (either due
98 * to consumption or the quota being specified to be smaller than the slice)
99 * we will always only issue the remaining available time.
100 *
101 * default: 5 msec, units: microseconds
102 */
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif
105
92static const struct sched_class fair_sched_class; 106static const struct sched_class fair_sched_class;
93 107
94/************************************************************** 108/**************************************************************
@@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
292 306
293#endif /* CONFIG_FAIR_GROUP_SCHED */ 307#endif /* CONFIG_FAIR_GROUP_SCHED */
294 308
309static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
310 unsigned long delta_exec);
295 311
296/************************************************************** 312/**************************************************************
297 * Scheduling class tree data structure manipulation methods: 313 * Scheduling class tree data structure manipulation methods:
@@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
583 cpuacct_charge(curtask, delta_exec); 599 cpuacct_charge(curtask, delta_exec);
584 account_group_exec_runtime(curtask, delta_exec); 600 account_group_exec_runtime(curtask, delta_exec);
585 } 601 }
602
603 account_cfs_rq_runtime(cfs_rq, delta_exec);
586} 604}
587 605
588static inline void 606static inline void
@@ -688,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
688} 706}
689 707
690#ifdef CONFIG_FAIR_GROUP_SCHED 708#ifdef CONFIG_FAIR_GROUP_SCHED
709/* we need this in update_cfs_load and load-balance functions below */
710static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
691# ifdef CONFIG_SMP 711# ifdef CONFIG_SMP
692static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, 712static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
693 int global_update) 713 int global_update)
@@ -710,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
710 u64 now, delta; 730 u64 now, delta;
711 unsigned long load = cfs_rq->load.weight; 731 unsigned long load = cfs_rq->load.weight;
712 732
713 if (cfs_rq->tg == &root_task_group) 733 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
714 return; 734 return;
715 735
716 now = rq_of(cfs_rq)->clock_task; 736 now = rq_of(cfs_rq)->clock_task;
@@ -819,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
819 839
820 tg = cfs_rq->tg; 840 tg = cfs_rq->tg;
821 se = tg->se[cpu_of(rq_of(cfs_rq))]; 841 se = tg->se[cpu_of(rq_of(cfs_rq))];
822 if (!se) 842 if (!se || throttled_hierarchy(cfs_rq))
823 return; 843 return;
824#ifndef CONFIG_SMP 844#ifndef CONFIG_SMP
825 if (likely(se->load.weight == tg->shares)) 845 if (likely(se->load.weight == tg->shares))
@@ -950,6 +970,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
950 se->vruntime = vruntime; 970 se->vruntime = vruntime;
951} 971}
952 972
973static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
974
953static void 975static void
954enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 976enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
955{ 977{
@@ -979,8 +1001,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
979 __enqueue_entity(cfs_rq, se); 1001 __enqueue_entity(cfs_rq, se);
980 se->on_rq = 1; 1002 se->on_rq = 1;
981 1003
982 if (cfs_rq->nr_running == 1) 1004 if (cfs_rq->nr_running == 1) {
983 list_add_leaf_cfs_rq(cfs_rq); 1005 list_add_leaf_cfs_rq(cfs_rq);
1006 check_enqueue_throttle(cfs_rq);
1007 }
984} 1008}
985 1009
986static void __clear_buddies_last(struct sched_entity *se) 1010static void __clear_buddies_last(struct sched_entity *se)
@@ -1028,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1028 __clear_buddies_skip(se); 1052 __clear_buddies_skip(se);
1029} 1053}
1030 1054
1055static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1056
1031static void 1057static void
1032dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1058dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1033{ 1059{
@@ -1066,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1066 if (!(flags & DEQUEUE_SLEEP)) 1092 if (!(flags & DEQUEUE_SLEEP))
1067 se->vruntime -= cfs_rq->min_vruntime; 1093 se->vruntime -= cfs_rq->min_vruntime;
1068 1094
1095 /* return excess runtime on last dequeue */
1096 return_cfs_rq_runtime(cfs_rq);
1097
1069 update_min_vruntime(cfs_rq); 1098 update_min_vruntime(cfs_rq);
1070 update_cfs_shares(cfs_rq); 1099 update_cfs_shares(cfs_rq);
1071} 1100}
@@ -1077,6 +1106,8 @@ static void
1077check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1106check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1078{ 1107{
1079 unsigned long ideal_runtime, delta_exec; 1108 unsigned long ideal_runtime, delta_exec;
1109 struct sched_entity *se;
1110 s64 delta;
1080 1111
1081 ideal_runtime = sched_slice(cfs_rq, curr); 1112 ideal_runtime = sched_slice(cfs_rq, curr);
1082 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 1113 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
@@ -1095,22 +1126,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1095 * narrow margin doesn't have to wait for a full slice. 1126 * narrow margin doesn't have to wait for a full slice.
1096 * This also mitigates buddy induced latencies under load. 1127 * This also mitigates buddy induced latencies under load.
1097 */ 1128 */
1098 if (!sched_feat(WAKEUP_PREEMPT))
1099 return;
1100
1101 if (delta_exec < sysctl_sched_min_granularity) 1129 if (delta_exec < sysctl_sched_min_granularity)
1102 return; 1130 return;
1103 1131
1104 if (cfs_rq->nr_running > 1) { 1132 se = __pick_first_entity(cfs_rq);
1105 struct sched_entity *se = __pick_first_entity(cfs_rq); 1133 delta = curr->vruntime - se->vruntime;
1106 s64 delta = curr->vruntime - se->vruntime;
1107 1134
1108 if (delta < 0) 1135 if (delta < 0)
1109 return; 1136 return;
1110 1137
1111 if (delta > ideal_runtime) 1138 if (delta > ideal_runtime)
1112 resched_task(rq_of(cfs_rq)->curr); 1139 resched_task(rq_of(cfs_rq)->curr);
1113 }
1114} 1140}
1115 1141
1116static void 1142static void
@@ -1185,6 +1211,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1185 return se; 1211 return se;
1186} 1212}
1187 1213
1214static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1215
1188static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 1216static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1189{ 1217{
1190 /* 1218 /*
@@ -1194,6 +1222,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1194 if (prev->on_rq) 1222 if (prev->on_rq)
1195 update_curr(cfs_rq); 1223 update_curr(cfs_rq);
1196 1224
1225 /* throttle cfs_rqs exceeding runtime */
1226 check_cfs_rq_runtime(cfs_rq);
1227
1197 check_spread(cfs_rq, prev); 1228 check_spread(cfs_rq, prev);
1198 if (prev->on_rq) { 1229 if (prev->on_rq) {
1199 update_stats_wait_start(cfs_rq, prev); 1230 update_stats_wait_start(cfs_rq, prev);
@@ -1233,10 +1264,583 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1233 return; 1264 return;
1234#endif 1265#endif
1235 1266
1236 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) 1267 if (cfs_rq->nr_running > 1)
1237 check_preempt_tick(cfs_rq, curr); 1268 check_preempt_tick(cfs_rq, curr);
1238} 1269}
1239 1270
1271
1272/**************************************************
1273 * CFS bandwidth control machinery
1274 */
1275
1276#ifdef CONFIG_CFS_BANDWIDTH
1277/*
1278 * default period for cfs group bandwidth.
1279 * default: 0.1s, units: nanoseconds
1280 */
1281static inline u64 default_cfs_period(void)
1282{
1283 return 100000000ULL;
1284}
1285
1286static inline u64 sched_cfs_bandwidth_slice(void)
1287{
1288 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1289}
1290
1291/*
1292 * Replenish runtime according to assigned quota and update expiration time.
1293 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
1294 * additional synchronization around rq->lock.
1295 *
1296 * requires cfs_b->lock
1297 */
1298static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1299{
1300 u64 now;
1301
1302 if (cfs_b->quota == RUNTIME_INF)
1303 return;
1304
1305 now = sched_clock_cpu(smp_processor_id());
1306 cfs_b->runtime = cfs_b->quota;
1307 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1308}
1309
1310/* returns 0 on failure to allocate runtime */
1311static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1312{
1313 struct task_group *tg = cfs_rq->tg;
1314 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1315 u64 amount = 0, min_amount, expires;
1316
1317 /* note: this is a positive sum as runtime_remaining <= 0 */
1318 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1319
1320 raw_spin_lock(&cfs_b->lock);
1321 if (cfs_b->quota == RUNTIME_INF)
1322 amount = min_amount;
1323 else {
1324 /*
1325 * If the bandwidth pool has become inactive, then at least one
1326 * period must have elapsed since the last consumption.
1327 * Refresh the global state and ensure bandwidth timer becomes
1328 * active.
1329 */
1330 if (!cfs_b->timer_active) {
1331 __refill_cfs_bandwidth_runtime(cfs_b);
1332 __start_cfs_bandwidth(cfs_b);
1333 }
1334
1335 if (cfs_b->runtime > 0) {
1336 amount = min(cfs_b->runtime, min_amount);
1337 cfs_b->runtime -= amount;
1338 cfs_b->idle = 0;
1339 }
1340 }
1341 expires = cfs_b->runtime_expires;
1342 raw_spin_unlock(&cfs_b->lock);
1343
1344 cfs_rq->runtime_remaining += amount;
1345 /*
1346 * we may have advanced our local expiration to account for allowed
1347 * spread between our sched_clock and the one on which runtime was
1348 * issued.
1349 */
1350 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1351 cfs_rq->runtime_expires = expires;
1352
1353 return cfs_rq->runtime_remaining > 0;
1354}
1355
1356/*
1357 * Note: This depends on the synchronization provided by sched_clock and the
1358 * fact that rq->clock snapshots this value.
1359 */
1360static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1361{
1362 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1363 struct rq *rq = rq_of(cfs_rq);
1364
1365 /* if the deadline is ahead of our clock, nothing to do */
1366 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1367 return;
1368
1369 if (cfs_rq->runtime_remaining < 0)
1370 return;
1371
1372 /*
1373 * If the local deadline has passed we have to consider the
1374 * possibility that our sched_clock is 'fast' and the global deadline
1375 * has not truly expired.
1376 *
 1377	 * Fortunately we can determine whether this is the case by checking
1378 * whether the global deadline has advanced.
1379 */
1380
1381 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1382 /* extend local deadline, drift is bounded above by 2 ticks */
1383 cfs_rq->runtime_expires += TICK_NSEC;
1384 } else {
1385 /* global deadline is ahead, expiration has passed */
1386 cfs_rq->runtime_remaining = 0;
1387 }
1388}
1389
1390static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1391 unsigned long delta_exec)
1392{
1393 /* dock delta_exec before expiring quota (as it could span periods) */
1394 cfs_rq->runtime_remaining -= delta_exec;
1395 expire_cfs_rq_runtime(cfs_rq);
1396
1397 if (likely(cfs_rq->runtime_remaining > 0))
1398 return;
1399
1400 /*
1401 * if we're unable to extend our runtime we resched so that the active
1402 * hierarchy can be throttled
1403 */
1404 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1405 resched_task(rq_of(cfs_rq)->curr);
1406}
1407
1408static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1409 unsigned long delta_exec)
1410{
1411 if (!cfs_rq->runtime_enabled)
1412 return;
1413
1414 __account_cfs_rq_runtime(cfs_rq, delta_exec);
1415}
1416
1417static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1418{
1419 return cfs_rq->throttled;
1420}
1421
1422/* check whether cfs_rq, or any parent, is throttled */
1423static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1424{
1425 return cfs_rq->throttle_count;
1426}
1427
1428/*
1429 * Ensure that neither of the group entities corresponding to src_cpu or
1430 * dest_cpu are members of a throttled hierarchy when performing group
1431 * load-balance operations.
1432 */
1433static inline int throttled_lb_pair(struct task_group *tg,
1434 int src_cpu, int dest_cpu)
1435{
1436 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1437
1438 src_cfs_rq = tg->cfs_rq[src_cpu];
1439 dest_cfs_rq = tg->cfs_rq[dest_cpu];
1440
1441 return throttled_hierarchy(src_cfs_rq) ||
1442 throttled_hierarchy(dest_cfs_rq);
1443}
1444
1445/* updated child weight may affect parent so we have to do this bottom up */
1446static int tg_unthrottle_up(struct task_group *tg, void *data)
1447{
1448 struct rq *rq = data;
1449 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1450
1451 cfs_rq->throttle_count--;
1452#ifdef CONFIG_SMP
1453 if (!cfs_rq->throttle_count) {
1454 u64 delta = rq->clock_task - cfs_rq->load_stamp;
1455
1456 /* leaving throttled state, advance shares averaging windows */
1457 cfs_rq->load_stamp += delta;
1458 cfs_rq->load_last += delta;
1459
1460 /* update entity weight now that we are on_rq again */
1461 update_cfs_shares(cfs_rq);
1462 }
1463#endif
1464
1465 return 0;
1466}
1467
1468static int tg_throttle_down(struct task_group *tg, void *data)
1469{
1470 struct rq *rq = data;
1471 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1472
1473 /* group is entering throttled state, record last load */
1474 if (!cfs_rq->throttle_count)
1475 update_cfs_load(cfs_rq, 0);
1476 cfs_rq->throttle_count++;
1477
1478 return 0;
1479}
1480
1481static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1482{
1483 struct rq *rq = rq_of(cfs_rq);
1484 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1485 struct sched_entity *se;
1486 long task_delta, dequeue = 1;
1487
1488 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1489
1490 /* account load preceding throttle */
1491 rcu_read_lock();
1492 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1493 rcu_read_unlock();
1494
1495 task_delta = cfs_rq->h_nr_running;
1496 for_each_sched_entity(se) {
1497 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1498 /* throttled entity or throttle-on-deactivate */
1499 if (!se->on_rq)
1500 break;
1501
1502 if (dequeue)
1503 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1504 qcfs_rq->h_nr_running -= task_delta;
1505
1506 if (qcfs_rq->load.weight)
1507 dequeue = 0;
1508 }
1509
1510 if (!se)
1511 rq->nr_running -= task_delta;
1512
1513 cfs_rq->throttled = 1;
1514 cfs_rq->throttled_timestamp = rq->clock;
1515 raw_spin_lock(&cfs_b->lock);
1516 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1517 raw_spin_unlock(&cfs_b->lock);
1518}
1519
1520static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1521{
1522 struct rq *rq = rq_of(cfs_rq);
1523 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1524 struct sched_entity *se;
1525 int enqueue = 1;
1526 long task_delta;
1527
1528 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1529
1530 cfs_rq->throttled = 0;
1531 raw_spin_lock(&cfs_b->lock);
1532 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1533 list_del_rcu(&cfs_rq->throttled_list);
1534 raw_spin_unlock(&cfs_b->lock);
1535 cfs_rq->throttled_timestamp = 0;
1536
1537 update_rq_clock(rq);
1538 /* update hierarchical throttle state */
1539 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1540
1541 if (!cfs_rq->load.weight)
1542 return;
1543
1544 task_delta = cfs_rq->h_nr_running;
1545 for_each_sched_entity(se) {
1546 if (se->on_rq)
1547 enqueue = 0;
1548
1549 cfs_rq = cfs_rq_of(se);
1550 if (enqueue)
1551 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1552 cfs_rq->h_nr_running += task_delta;
1553
1554 if (cfs_rq_throttled(cfs_rq))
1555 break;
1556 }
1557
1558 if (!se)
1559 rq->nr_running += task_delta;
1560
1561	/* determine whether we need to wake up a potentially idle cpu */
1562 if (rq->curr == rq->idle && rq->cfs.nr_running)
1563 resched_task(rq->curr);
1564}
1565
1566static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
1567 u64 remaining, u64 expires)
1568{
1569 struct cfs_rq *cfs_rq;
1570 u64 runtime = remaining;
1571
1572 rcu_read_lock();
1573 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1574 throttled_list) {
1575 struct rq *rq = rq_of(cfs_rq);
1576
1577 raw_spin_lock(&rq->lock);
1578 if (!cfs_rq_throttled(cfs_rq))
1579 goto next;
1580
1581 runtime = -cfs_rq->runtime_remaining + 1;
1582 if (runtime > remaining)
1583 runtime = remaining;
1584 remaining -= runtime;
1585
1586 cfs_rq->runtime_remaining += runtime;
1587 cfs_rq->runtime_expires = expires;
1588
1589 /* we check whether we're throttled above */
1590 if (cfs_rq->runtime_remaining > 0)
1591 unthrottle_cfs_rq(cfs_rq);
1592
1593next:
1594 raw_spin_unlock(&rq->lock);
1595
1596 if (!remaining)
1597 break;
1598 }
1599 rcu_read_unlock();
1600
1601 return remaining;
1602}
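
Each throttled cfs_rq in the loop above is topped up to exactly one nanosecond of positive runtime, so the available pool unthrottles as many cfs_rqs as possible. A standalone sketch with assumed per-cfs_rq debts and pool size:

#include <stdio.h>

int main(void)
{
	long long remaining = 3000000LL;                /* 3ms left in the global pool */
	long long debts[] = { -1500000LL, -2500000LL }; /* per-cfs_rq runtime_remaining */

	for (int i = 0; i < 2 && remaining; i++) {
		long long runtime = -debts[i] + 1;      /* just enough to reach +1 ns */

		if (runtime > remaining)
			runtime = remaining;
		remaining -= runtime;
		debts[i] += runtime;

		printf("cfs_rq %d: granted %lld ns, runtime_remaining %lld ns\n",
		       i, runtime, debts[i]);
	}
	printf("pool remaining: %lld ns\n", remaining);
	return 0;
}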
1603
1604/*
1605 * Responsible for refilling a task_group's bandwidth and unthrottling its
1606 * cfs_rqs as appropriate. If there has been no activity within the last
1607 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
1608 * used to track this state.
1609 */
1610static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
1611{
1612 u64 runtime, runtime_expires;
1613 int idle = 1, throttled;
1614
1615 raw_spin_lock(&cfs_b->lock);
1616 /* no need to continue the timer with no bandwidth constraint */
1617 if (cfs_b->quota == RUNTIME_INF)
1618 goto out_unlock;
1619
1620 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1621 /* idle depends on !throttled (for the case of a large deficit) */
1622 idle = cfs_b->idle && !throttled;
1623 cfs_b->nr_periods += overrun;
1624
1625 /* if we're going inactive then everything else can be deferred */
1626 if (idle)
1627 goto out_unlock;
1628
1629 __refill_cfs_bandwidth_runtime(cfs_b);
1630
1631 if (!throttled) {
1632 /* mark as potentially idle for the upcoming period */
1633 cfs_b->idle = 1;
1634 goto out_unlock;
1635 }
1636
1637 /* account preceding periods in which throttling occurred */
1638 cfs_b->nr_throttled += overrun;
1639
1640 /*
1641 * There are throttled entities so we must first use the new bandwidth
1642 * to unthrottle them before making it generally available. This
1643 * ensures that all existing debts will be paid before a new cfs_rq is
1644 * allowed to run.
1645 */
1646 runtime = cfs_b->runtime;
1647 runtime_expires = cfs_b->runtime_expires;
1648 cfs_b->runtime = 0;
1649
1650 /*
1651 * This check is repeated as we are holding onto the new bandwidth
1652 * while we unthrottle. This can potentially race with an unthrottled
1653 * group trying to acquire new bandwidth from the global pool.
1654 */
1655 while (throttled && runtime > 0) {
1656 raw_spin_unlock(&cfs_b->lock);
1657 /* we can't nest cfs_b->lock while distributing bandwidth */
1658 runtime = distribute_cfs_runtime(cfs_b, runtime,
1659 runtime_expires);
1660 raw_spin_lock(&cfs_b->lock);
1661
1662 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1663 }
1664
1665 /* return (any) remaining runtime */
1666 cfs_b->runtime = runtime;
1667 /*
1668	 * While we are guaranteed activity in the period following an
1669 * unthrottle, this also covers the case in which the new bandwidth is
1670 * insufficient to cover the existing bandwidth deficit. (Forcing the
1671 * timer to remain active while there are any throttled entities.)
1672 */
1673 cfs_b->idle = 0;
1674out_unlock:
1675 if (idle)
1676 cfs_b->timer_active = 0;
1677 raw_spin_unlock(&cfs_b->lock);
1678
1679 return idle;
1680}
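
The timer may only go inactive when the previous period was idle and nothing is currently throttled. A small sketch enumerating that decision (illustrative only, not kernel code):

#include <stdio.h>

int main(void)
{
	for (int was_idle = 0; was_idle <= 1; was_idle++) {
		for (int throttled = 0; throttled <= 1; throttled++) {
			/* idle depends on !throttled, as in the handler above */
			int idle = was_idle && !throttled;

			printf("cfs_b->idle=%d throttled=%d -> timer %s\n",
			       was_idle, throttled,
			       idle ? "deactivates" : "stays active");
		}
	}
	return 0;
}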
1681
1682/* a cfs_rq won't donate quota below this amount */
1683static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1684/* minimum remaining period time to redistribute slack quota */
1685static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1686/* how long we wait to gather additional slack before distributing */
1687static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
1688
1689/* are we near the end of the current quota period? */
1690static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1691{
1692 struct hrtimer *refresh_timer = &cfs_b->period_timer;
1693 u64 remaining;
1694
1695	/* if the callback is running, a quota refresh is already occurring */
1696 if (hrtimer_callback_running(refresh_timer))
1697 return 1;
1698
1699 /* is a quota refresh about to occur? */
1700 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1701 if (remaining < min_expire)
1702 return 1;
1703
1704 return 0;
1705}
1706
1707static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1708{
1709 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1710
1711 /* if there's a quota refresh soon don't bother with slack */
1712 if (runtime_refresh_within(cfs_b, min_left))
1713 return;
1714
1715 start_bandwidth_timer(&cfs_b->slack_timer,
1716 ns_to_ktime(cfs_bandwidth_slack_period));
1717}
1718
1719/* we know any runtime found here is valid as update_curr() precedes return */
1720static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1721{
1722 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1723 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1724
1725 if (slack_runtime <= 0)
1726 return;
1727
1728 raw_spin_lock(&cfs_b->lock);
1729 if (cfs_b->quota != RUNTIME_INF &&
1730 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1731 cfs_b->runtime += slack_runtime;
1732
1733 /* we are under rq->lock, defer unthrottling using a timer */
1734 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1735 !list_empty(&cfs_b->throttled_cfs_rq))
1736 start_cfs_slack_bandwidth(cfs_b);
1737 }
1738 raw_spin_unlock(&cfs_b->lock);
1739
1740 /* even if it's not valid for return we don't want to try again */
1741 cfs_rq->runtime_remaining -= slack_runtime;
1742}
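
Everything beyond the min_cfs_rq_runtime a cfs_rq keeps for itself is offered back to the global pool, and the local balance is docked either way. A minimal sketch of that computation with assumed values:

#include <stdio.h>

int main(void)
{
	long long min_cfs_rq_runtime = 1000000LL;  /* 1ms kept locally */
	long long runtime_remaining = 3500000LL;   /* unused local runtime */

	long long slack = runtime_remaining - min_cfs_rq_runtime;
	if (slack > 0) {
		runtime_remaining -= slack;        /* donated back to the global pool */
		printf("returned %lld ns, kept %lld ns\n", slack, runtime_remaining);
	}
	return 0;
}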
1743
1744static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1745{
1746 if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
1747 return;
1748
1749 __return_cfs_rq_runtime(cfs_rq);
1750}
1751
1752/*
1753 * This is done with a timer (instead of inline with bandwidth return) since
1754 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
1755 */
1756static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1757{
1758 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1759 u64 expires;
1760
1761 /* confirm we're still not at a refresh boundary */
1762 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
1763 return;
1764
1765 raw_spin_lock(&cfs_b->lock);
1766 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1767 runtime = cfs_b->runtime;
1768 cfs_b->runtime = 0;
1769 }
1770 expires = cfs_b->runtime_expires;
1771 raw_spin_unlock(&cfs_b->lock);
1772
1773 if (!runtime)
1774 return;
1775
1776 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1777
1778 raw_spin_lock(&cfs_b->lock);
1779 if (expires == cfs_b->runtime_expires)
1780 cfs_b->runtime = runtime;
1781 raw_spin_unlock(&cfs_b->lock);
1782}
1783
1784/*
1785 * When a group wakes up we want to make sure that its quota is not already
1786 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
1787 * runtime, since update_curr() throttling cannot trigger until it's on-rq.
1788 */
1789static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1790{
1791 /* an active group must be handled by the update_curr()->put() path */
1792 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1793 return;
1794
1795 /* ensure the group is not already throttled */
1796 if (cfs_rq_throttled(cfs_rq))
1797 return;
1798
1799 /* update runtime allocation */
1800 account_cfs_rq_runtime(cfs_rq, 0);
1801 if (cfs_rq->runtime_remaining <= 0)
1802 throttle_cfs_rq(cfs_rq);
1803}
1804
1805/* conditionally throttle active cfs_rq's from put_prev_entity() */
1806static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1807{
1808 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1809 return;
1810
1811 /*
1812 * it's possible for a throttled entity to be forced into a running
1813	 * state (e.g. set_curr_task); in this case we're finished.
1814 */
1815 if (cfs_rq_throttled(cfs_rq))
1816 return;
1817
1818 throttle_cfs_rq(cfs_rq);
1819}
1820#else
1821static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1822 unsigned long delta_exec) {}
1823static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1824static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
1825static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1826
1827static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1828{
1829 return 0;
1830}
1831
1832static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1833{
1834 return 0;
1835}
1836
1837static inline int throttled_lb_pair(struct task_group *tg,
1838 int src_cpu, int dest_cpu)
1839{
1840 return 0;
1841}
1842#endif
1843
1240/************************************************** 1844/**************************************************
1241 * CFS operations on tasks: 1845 * CFS operations on tasks:
1242 */ 1846 */
@@ -1313,16 +1917,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1313 break; 1917 break;
1314 cfs_rq = cfs_rq_of(se); 1918 cfs_rq = cfs_rq_of(se);
1315 enqueue_entity(cfs_rq, se, flags); 1919 enqueue_entity(cfs_rq, se, flags);
1920
1921 /*
1922 * end evaluation on encountering a throttled cfs_rq
1923 *
1924 * note: in the case of encountering a throttled cfs_rq we will
1925 * post the final h_nr_running increment below.
1926 */
1927 if (cfs_rq_throttled(cfs_rq))
1928 break;
1929 cfs_rq->h_nr_running++;
1930
1316 flags = ENQUEUE_WAKEUP; 1931 flags = ENQUEUE_WAKEUP;
1317 } 1932 }
1318 1933
1319 for_each_sched_entity(se) { 1934 for_each_sched_entity(se) {
1320 cfs_rq = cfs_rq_of(se); 1935 cfs_rq = cfs_rq_of(se);
1936 cfs_rq->h_nr_running++;
1937
1938 if (cfs_rq_throttled(cfs_rq))
1939 break;
1321 1940
1322 update_cfs_load(cfs_rq, 0); 1941 update_cfs_load(cfs_rq, 0);
1323 update_cfs_shares(cfs_rq); 1942 update_cfs_shares(cfs_rq);
1324 } 1943 }
1325 1944
1945 if (!se)
1946 inc_nr_running(rq);
1326 hrtick_update(rq); 1947 hrtick_update(rq);
1327} 1948}
1328 1949
@@ -1343,6 +1964,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1343 cfs_rq = cfs_rq_of(se); 1964 cfs_rq = cfs_rq_of(se);
1344 dequeue_entity(cfs_rq, se, flags); 1965 dequeue_entity(cfs_rq, se, flags);
1345 1966
1967 /*
1968 * end evaluation on encountering a throttled cfs_rq
1969 *
1970 * note: in the case of encountering a throttled cfs_rq we will
1971 * post the final h_nr_running decrement below.
1972 */
1973 if (cfs_rq_throttled(cfs_rq))
1974 break;
1975 cfs_rq->h_nr_running--;
1976
1346 /* Don't dequeue parent if it has other entities besides us */ 1977 /* Don't dequeue parent if it has other entities besides us */
1347 if (cfs_rq->load.weight) { 1978 if (cfs_rq->load.weight) {
1348 /* 1979 /*
@@ -1361,11 +1992,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1361 1992
1362 for_each_sched_entity(se) { 1993 for_each_sched_entity(se) {
1363 cfs_rq = cfs_rq_of(se); 1994 cfs_rq = cfs_rq_of(se);
1995 cfs_rq->h_nr_running--;
1996
1997 if (cfs_rq_throttled(cfs_rq))
1998 break;
1364 1999
1365 update_cfs_load(cfs_rq, 0); 2000 update_cfs_load(cfs_rq, 0);
1366 update_cfs_shares(cfs_rq); 2001 update_cfs_shares(cfs_rq);
1367 } 2002 }
1368 2003
2004 if (!se)
2005 dec_nr_running(rq);
1369 hrtick_update(rq); 2006 hrtick_update(rq);
1370} 2007}
1371 2008
@@ -1434,7 +2071,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1434 2071
1435 return wl; 2072 return wl;
1436} 2073}
1437
1438#else 2074#else
1439 2075
1440static inline unsigned long effective_load(struct task_group *tg, int cpu, 2076static inline unsigned long effective_load(struct task_group *tg, int cpu,
@@ -1875,6 +2511,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1875 if (unlikely(se == pse)) 2511 if (unlikely(se == pse))
1876 return; 2512 return;
1877 2513
2514 /*
2515 * This is possible from callers such as pull_task(), in which we
2516	 * unconditionally check_preempt_curr() after an enqueue (which may have
2517	 * led to a throttle). This both saves work and prevents false
2518 * next-buddy nomination below.
2519 */
2520 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2521 return;
2522
1878 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { 2523 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1879 set_next_buddy(pse); 2524 set_next_buddy(pse);
1880 next_buddy_marked = 1; 2525 next_buddy_marked = 1;
@@ -1883,6 +2528,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1883 /* 2528 /*
1884 * We can come here with TIF_NEED_RESCHED already set from new task 2529 * We can come here with TIF_NEED_RESCHED already set from new task
1885 * wake up path. 2530 * wake up path.
2531 *
2532 * Note: this also catches the edge-case of curr being in a throttled
2533 * group (e.g. via set_curr_task), since update_curr() (in the
2534 * enqueue of curr) will have resulted in resched being set. This
2535 * prevents us from potentially nominating it as a false LAST_BUDDY
2536 * below.
1886 */ 2537 */
1887 if (test_tsk_need_resched(curr)) 2538 if (test_tsk_need_resched(curr))
1888 return; 2539 return;
@@ -1899,10 +2550,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1899 if (unlikely(p->policy != SCHED_NORMAL)) 2550 if (unlikely(p->policy != SCHED_NORMAL))
1900 return; 2551 return;
1901 2552
1902
1903 if (!sched_feat(WAKEUP_PREEMPT))
1904 return;
1905
1906 find_matching_se(&se, &pse); 2553 find_matching_se(&se, &pse);
1907 update_curr(cfs_rq_of(se)); 2554 update_curr(cfs_rq_of(se));
1908 BUG_ON(!pse); 2555 BUG_ON(!pse);
@@ -2005,7 +2652,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
2005{ 2652{
2006 struct sched_entity *se = &p->se; 2653 struct sched_entity *se = &p->se;
2007 2654
2008 if (!se->on_rq) 2655 /* throttled hierarchies are not runnable */
2656 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
2009 return false; 2657 return false;
2010 2658
2011 /* Tell the scheduler that we'd really like pse to run next. */ 2659 /* Tell the scheduler that we'd really like pse to run next. */
@@ -2102,6 +2750,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2102 2750
2103 for_each_leaf_cfs_rq(busiest, cfs_rq) { 2751 for_each_leaf_cfs_rq(busiest, cfs_rq) {
2104 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { 2752 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
2753 if (throttled_lb_pair(task_group(p),
2754 busiest->cpu, this_cpu))
2755 break;
2105 2756
2106 if (!can_migrate_task(p, busiest, this_cpu, 2757 if (!can_migrate_task(p, busiest, this_cpu,
2107 sd, idle, &pinned)) 2758 sd, idle, &pinned))
@@ -2217,8 +2868,13 @@ static void update_shares(int cpu)
2217 * Iterates the task_group tree in a bottom up fashion, see 2868 * Iterates the task_group tree in a bottom up fashion, see
2218 * list_add_leaf_cfs_rq() for details. 2869 * list_add_leaf_cfs_rq() for details.
2219 */ 2870 */
2220 for_each_leaf_cfs_rq(rq, cfs_rq) 2871 for_each_leaf_cfs_rq(rq, cfs_rq) {
2872 /* throttled entities do not contribute to load */
2873 if (throttled_hierarchy(cfs_rq))
2874 continue;
2875
2221 update_shares_cpu(cfs_rq->tg, cpu); 2876 update_shares_cpu(cfs_rq->tg, cpu);
2877 }
2222 rcu_read_unlock(); 2878 rcu_read_unlock();
2223} 2879}
2224 2880
@@ -2268,9 +2924,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2268 u64 rem_load, moved_load; 2924 u64 rem_load, moved_load;
2269 2925
2270 /* 2926 /*
2271 * empty group 2927 * empty group or part of a throttled hierarchy
2272 */ 2928 */
2273 if (!busiest_cfs_rq->task_weight) 2929 if (!busiest_cfs_rq->task_weight ||
2930 throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
2274 continue; 2931 continue;
2275 2932
2276 rem_load = (u64)rem_load_move * busiest_weight; 2933 rem_load = (u64)rem_load_move * busiest_weight;
@@ -3667,7 +4324,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3667 struct sched_domain *sd; 4324 struct sched_domain *sd;
3668 4325
3669 for_each_domain(cpu, sd) 4326 for_each_domain(cpu, sd)
3670 if (sd && (sd->flags & flag)) 4327 if (sd->flags & flag)
3671 break; 4328 break;
3672 4329
3673 return sd; 4330 return sd;
@@ -4251,8 +4908,13 @@ static void set_curr_task_fair(struct rq *rq)
4251{ 4908{
4252 struct sched_entity *se = &rq->curr->se; 4909 struct sched_entity *se = &rq->curr->se;
4253 4910
4254 for_each_sched_entity(se) 4911 for_each_sched_entity(se) {
4255 set_next_entity(cfs_rq_of(se), se); 4912 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4913
4914 set_next_entity(cfs_rq, se);
4915 /* ensure bandwidth has been allocated on our new cfs_rq */
4916 account_cfs_rq_runtime(cfs_rq, 0);
4917 }
4256} 4918}
4257 4919
4258#ifdef CONFIG_FAIR_GROUP_SCHED 4920#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 2e74677cb040..efa0a7b75dde 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,11 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, 1)
13 13
14/* 14/*
15 * Should wakeups try to preempt running tasks.
16 */
17SCHED_FEAT(WAKEUP_PREEMPT, 1)
18
19/*
20 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
21 * a newly woken task on the same cpu as the task that woke it -- 16 * a newly woken task on the same cpu as the task that woke it --
22 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index af1177858be3..0cc188cf7664 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -124,21 +124,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
124 update_rt_migration(rt_rq); 124 update_rt_migration(rt_rq);
125} 125}
126 126
127static inline int has_pushable_tasks(struct rq *rq)
128{
129 return !plist_head_empty(&rq->rt.pushable_tasks);
130}
131
127static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 132static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
128{ 133{
129 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 134 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
130 plist_node_init(&p->pushable_tasks, p->prio); 135 plist_node_init(&p->pushable_tasks, p->prio);
131 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); 136 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
137
138 /* Update the highest prio pushable task */
139 if (p->prio < rq->rt.highest_prio.next)
140 rq->rt.highest_prio.next = p->prio;
132} 141}
133 142
134static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 143static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
135{ 144{
136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 145 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
137}
138 146
139static inline int has_pushable_tasks(struct rq *rq) 147 /* Update the new highest prio pushable task */
140{ 148 if (has_pushable_tasks(rq)) {
141 return !plist_head_empty(&rq->rt.pushable_tasks); 149 p = plist_first_entry(&rq->rt.pushable_tasks,
150 struct task_struct, pushable_tasks);
151 rq->rt.highest_prio.next = p->prio;
152 } else
153 rq->rt.highest_prio.next = MAX_RT_PRIO;
142} 154}
143 155
144#else 156#else
@@ -698,47 +710,13 @@ static void update_curr_rt(struct rq *rq)
698 710
699#if defined CONFIG_SMP 711#if defined CONFIG_SMP
700 712
701static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
702
703static inline int next_prio(struct rq *rq)
704{
705 struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
706
707 if (next && rt_prio(next->prio))
708 return next->prio;
709 else
710 return MAX_RT_PRIO;
711}
712
713static void 713static void
714inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 714inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
715{ 715{
716 struct rq *rq = rq_of_rt_rq(rt_rq); 716 struct rq *rq = rq_of_rt_rq(rt_rq);
717 717
718 if (prio < prev_prio) { 718 if (rq->online && prio < prev_prio)
719 719 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
720 /*
721 * If the new task is higher in priority than anything on the
722 * run-queue, we know that the previous high becomes our
723 * next-highest.
724 */
725 rt_rq->highest_prio.next = prev_prio;
726
727 if (rq->online)
728 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
729
730 } else if (prio == rt_rq->highest_prio.curr)
731 /*
732 * If the next task is equal in priority to the highest on
733 * the run-queue, then we implicitly know that the next highest
734 * task cannot be any lower than current
735 */
736 rt_rq->highest_prio.next = prio;
737 else if (prio < rt_rq->highest_prio.next)
738 /*
739 * Otherwise, we need to recompute next-highest
740 */
741 rt_rq->highest_prio.next = next_prio(rq);
742} 720}
743 721
744static void 722static void
@@ -746,9 +724,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
746{ 724{
747 struct rq *rq = rq_of_rt_rq(rt_rq); 725 struct rq *rq = rq_of_rt_rq(rt_rq);
748 726
749 if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
750 rt_rq->highest_prio.next = next_prio(rq);
751
752 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 727 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
753 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 728 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
754} 729}
@@ -961,6 +936,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
961 936
962 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 937 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
963 enqueue_pushable_task(rq, p); 938 enqueue_pushable_task(rq, p);
939
940 inc_nr_running(rq);
964} 941}
965 942
966static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 943static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -971,6 +948,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
971 dequeue_rt_entity(rt_se); 948 dequeue_rt_entity(rt_se);
972 949
973 dequeue_pushable_task(rq, p); 950 dequeue_pushable_task(rq, p);
951
952 dec_nr_running(rq);
974} 953}
975 954
976/* 955/*
@@ -1017,10 +996,12 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1017 struct rq *rq; 996 struct rq *rq;
1018 int cpu; 997 int cpu;
1019 998
1020 if (sd_flag != SD_BALANCE_WAKE)
1021 return smp_processor_id();
1022
1023 cpu = task_cpu(p); 999 cpu = task_cpu(p);
1000
1001 /* For anything but wake ups, just return the task_cpu */
1002 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1003 goto out;
1004
1024 rq = cpu_rq(cpu); 1005 rq = cpu_rq(cpu);
1025 1006
1026 rcu_read_lock(); 1007 rcu_read_lock();
@@ -1059,6 +1040,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1059 } 1040 }
1060 rcu_read_unlock(); 1041 rcu_read_unlock();
1061 1042
1043out:
1062 return cpu; 1044 return cpu;
1063} 1045}
1064 1046
@@ -1178,7 +1160,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1178static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 1160static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1179{ 1161{
1180 update_curr_rt(rq); 1162 update_curr_rt(rq);
1181 p->se.exec_start = 0;
1182 1163
1183 /* 1164 /*
1184 * The previous task needs to be made eligible for pushing 1165 * The previous task needs to be made eligible for pushing
@@ -1394,6 +1375,7 @@ static int push_rt_task(struct rq *rq)
1394{ 1375{
1395 struct task_struct *next_task; 1376 struct task_struct *next_task;
1396 struct rq *lowest_rq; 1377 struct rq *lowest_rq;
1378 int ret = 0;
1397 1379
1398 if (!rq->rt.overloaded) 1380 if (!rq->rt.overloaded)
1399 return 0; 1381 return 0;
@@ -1426,7 +1408,7 @@ retry:
1426 if (!lowest_rq) { 1408 if (!lowest_rq) {
1427 struct task_struct *task; 1409 struct task_struct *task;
1428 /* 1410 /*
1429 * find lock_lowest_rq releases rq->lock 1411 * find_lock_lowest_rq releases rq->lock
1430 * so it is possible that next_task has migrated. 1412 * so it is possible that next_task has migrated.
1431 * 1413 *
1432 * We need to make sure that the task is still on the same 1414 * We need to make sure that the task is still on the same
@@ -1436,12 +1418,11 @@ retry:
1436 task = pick_next_pushable_task(rq); 1418 task = pick_next_pushable_task(rq);
1437 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1419 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1438 /* 1420 /*
1439 * If we get here, the task hasn't moved at all, but 1421 * The task hasn't migrated, and is still the next
1440 * it has failed to push. We will not try again, 1422 * eligible task, but we failed to find a run-queue
1441 * since the other cpus will pull from us when they 1423 * to push it to. Do not retry in this case, since
1442 * are ready. 1424 * other cpus will pull from us when ready.
1443 */ 1425 */
1444 dequeue_pushable_task(rq, next_task);
1445 goto out; 1426 goto out;
1446 } 1427 }
1447 1428
@@ -1460,6 +1441,7 @@ retry:
1460 deactivate_task(rq, next_task, 0); 1441 deactivate_task(rq, next_task, 0);
1461 set_task_cpu(next_task, lowest_rq->cpu); 1442 set_task_cpu(next_task, lowest_rq->cpu);
1462 activate_task(lowest_rq, next_task, 0); 1443 activate_task(lowest_rq, next_task, 0);
1444 ret = 1;
1463 1445
1464 resched_task(lowest_rq->curr); 1446 resched_task(lowest_rq->curr);
1465 1447
@@ -1468,7 +1450,7 @@ retry:
1468out: 1450out:
1469 put_task_struct(next_task); 1451 put_task_struct(next_task);
1470 1452
1471 return 1; 1453 return ret;
1472} 1454}
1473 1455
1474static void push_rt_tasks(struct rq *rq) 1456static void push_rt_tasks(struct rq *rq)
@@ -1863,4 +1845,3 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1863 rcu_read_unlock(); 1845 rcu_read_unlock();
1864} 1846}
1865#endif /* CONFIG_SCHED_DEBUG */ 1847#endif /* CONFIG_SCHED_DEBUG */
1866
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 6f437632afab..8b44e7fa7fb3 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
34static void 34static void
35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 inc_nr_running(rq);
37} 38}
38 39
39static void 40static void
40dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 41dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
41{ 42{
43 dec_nr_running(rq);
42} 44}
43 45
44static void yield_task_stop(struct rq *rq) 46static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b531e50..2d2ecdcc8cdb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = {
379 .extra2 = &one, 379 .extra2 = &one,
380 }, 380 },
381#endif 381#endif
382#ifdef CONFIG_CFS_BANDWIDTH
383 {
384 .procname = "sched_cfs_bandwidth_slice_us",
385 .data = &sysctl_sched_cfs_bandwidth_slice,
386 .maxlen = sizeof(unsigned int),
387 .mode = 0644,
388 .proc_handler = proc_dointvec_minmax,
389 .extra1 = &one,
390 },
391#endif
382#ifdef CONFIG_PROVE_LOCKING 392#ifdef CONFIG_PROVE_LOCKING
383 { 393 {
384 .procname = "prove_locking", 394 .procname = "prove_locking",