diff options
-rw-r--r-- | Documentation/scheduler/sched-bwc.txt | 122 | ||||
-rw-r--r-- | include/linux/sched.h | 4 | ||||
-rw-r--r-- | include/trace/events/sched.h | 9 | ||||
-rw-r--r-- | init/Kconfig | 12 | ||||
-rw-r--r-- | kernel/sched.c | 559 | ||||
-rw-r--r-- | kernel/sched_cpupri.c | 89 | ||||
-rw-r--r-- | kernel/sched_cpupri.h | 7 | ||||
-rw-r--r-- | kernel/sched_fair.c | 716 | ||||
-rw-r--r-- | kernel/sched_features.h | 5 | ||||
-rw-r--r-- | kernel/sched_rt.c | 91 | ||||
-rw-r--r-- | kernel/sched_stoptask.c | 2 | ||||
-rw-r--r-- | kernel/sysctl.c | 10 |
12 files changed, 1439 insertions, 187 deletions
diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt new file mode 100644 index 000000000000..f6b1873f68ab --- /dev/null +++ b/Documentation/scheduler/sched-bwc.txt | |||
@@ -0,0 +1,122 @@ | |||
1 | CFS Bandwidth Control | ||
2 | ===================== | ||
3 | |||
4 | [ This document only discusses CPU bandwidth control for SCHED_NORMAL. | ||
5 | The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.txt ] | ||
6 | |||
7 | CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the | ||
8 | specification of the maximum CPU bandwidth available to a group or hierarchy. | ||
9 | |||
10 | The bandwidth allowed for a group is specified using a quota and period. Within | ||
11 | each given "period" (microseconds), a group is allowed to consume only up to | ||
12 | "quota" microseconds of CPU time. When the CPU bandwidth consumption of a | ||
13 | group exceeds this limit (for that period), the tasks belonging to its | ||
14 | hierarchy will be throttled and are not allowed to run again until the next | ||
15 | period. | ||
16 | |||
17 | A group's unused runtime is globally tracked, being refreshed with the quota units | ||
18 | specified above at each period boundary. As threads consume this bandwidth it is | ||
19 | transferred to cpu-local "silos" on a demand basis. The amount transferred | ||
20 | within each of these updates is tunable and described as the "slice". | ||
21 | |||
22 | Management | ||
23 | ---------- | ||
24 | Quota and period are managed within the cpu subsystem via cgroupfs. | ||
25 | |||
26 | cpu.cfs_quota_us: the total available run-time within a period (in microseconds) | ||
27 | cpu.cfs_period_us: the length of a period (in microseconds) | ||
28 | cpu.stat: exports throttling statistics [explained further below] | ||
29 | |||
30 | The default values are: | ||
31 | cpu.cfs_period_us=100ms | ||
32 | cpu.cfs_quota_us=-1 | ||
33 | |||
34 | A value of -1 for cpu.cfs_quota_us indicates that the group does not have any | ||
35 | bandwidth restriction in place; such a group is described as an unconstrained | ||
36 | bandwidth group. This represents the traditional work-conserving behavior for | ||
37 | CFS. | ||
38 | |||
39 | Writing any (valid) positive value(s) will enact the specified bandwidth limit. | ||
40 | The minimum value allowed for the quota or period is 1ms. There is also an | ||
41 | upper bound on the period length of 1s. Additional restrictions exist when | ||
42 | bandwidth limits are used in a hierarchical fashion; these are explained in | ||
43 | more detail below. | ||
44 | |||
45 | Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit | ||
46 | and return the group to an unconstrained state once more. | ||
47 | |||
48 | Any updates to a group's bandwidth specification will result in it becoming | ||
49 | unthrottled if it is in a constrained state. | ||
50 | |||
51 | System wide settings | ||
52 | -------------------- | ||
53 | For efficiency run-time is transferred between the global pool and CPU local | ||
54 | "silos" in a batch fashion. This greatly reduces global accounting pressure | ||
55 | on large systems. The amount transferred each time such an update is required | ||
56 | is described as the "slice". | ||
57 | |||
58 | This is tunable via procfs: | ||
59 | /proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms) | ||
60 | |||
61 | Larger slice values will reduce transfer overheads, while smaller values allow | ||
62 | for more fine-grained consumption. | ||
63 | |||
64 | Statistics | ||
65 | ---------- | ||
66 | A group's bandwidth statistics are exported via 3 fields in cpu.stat. | ||
67 | |||
68 | cpu.stat: | ||
69 | - nr_periods: Number of enforcement intervals that have elapsed. | ||
70 | - nr_throttled: Number of times the group has been throttled/limited. | ||
71 | - throttled_time: The total time duration (in nanoseconds) for which entities | ||
72 | of the group have been throttled. | ||
73 | |||
74 | This interface is read-only. | ||
75 | |||
76 | Hierarchical considerations | ||
77 | --------------------------- | ||
78 | The interface enforces that an individual entity's bandwidth is always | ||
79 | attainable, that is: max(c_i) <= C. However, over-subscription in the | ||
80 | aggregate case is explicitly allowed to enable work-conserving semantics | ||
81 | within a hierarchy. | ||
82 | e.g. \Sum (c_i) may exceed C | ||
83 | [ Where C is the parent's bandwidth, and c_i its children ] | ||
84 | |||
85 | |||
86 | There are two ways in which a group may become throttled: | ||
87 | a. it fully consumes its own quota within a period | ||
88 | b. a parent's quota is fully consumed within its period | ||
89 | |||
90 | In case b) above, even though the child may have runtime remaining it will not | ||
91 | be allowed to run until the parent's runtime is refreshed. | ||
92 | |||
93 | Examples | ||
94 | -------- | ||
95 | 1. Limit a group to 1 CPU worth of runtime. | ||
96 | |||
97 | If period is 250ms and quota is also 250ms, the group will get | ||
98 | 1 CPU worth of runtime every 250ms. | ||
99 | |||
100 | # echo 250000 > cpu.cfs_quota_us /* quota = 250ms */ | ||
101 | # echo 250000 > cpu.cfs_period_us /* period = 250ms */ | ||
102 | |||
103 | 2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine. | ||
104 | |||
105 | With 500ms period and 1000ms quota, the group can get 2 CPUs worth of | ||
106 | runtime every 500ms. | ||
107 | |||
108 | # echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */ | ||
109 | # echo 500000 > cpu.cfs_period_us /* period = 500ms */ | ||
110 | |||
111 | The larger period here allows for increased burst capacity. | ||
112 | |||
113 | 3. Limit a group to 20% of 1 CPU. | ||
114 | |||
115 | With 50ms period, 10ms quota will be equivalent to 20% of 1 CPU. | ||
116 | |||
117 | # echo 10000 > cpu.cfs_quota_us /* quota = 10ms */ | ||
118 | # echo 50000 > cpu.cfs_period_us /* period = 50ms */ | ||
119 | |||
120 | By using a small period here we are ensuring a consistent latency | ||
121 | response at the expense of burst capacity. | ||
122 | |||
diff --git a/include/linux/sched.h b/include/linux/sched.h index 41d0237fd449..9fda2888a6ab 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -2039,6 +2039,10 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { } | |||
2039 | static inline void sched_autogroup_exit(struct signal_struct *sig) { } | 2039 | static inline void sched_autogroup_exit(struct signal_struct *sig) { } |
2040 | #endif | 2040 | #endif |
2041 | 2041 | ||
2042 | #ifdef CONFIG_CFS_BANDWIDTH | ||
2043 | extern unsigned int sysctl_sched_cfs_bandwidth_slice; | ||
2044 | #endif | ||
2045 | |||
2042 | #ifdef CONFIG_RT_MUTEXES | 2046 | #ifdef CONFIG_RT_MUTEXES |
2043 | extern int rt_mutex_getprio(struct task_struct *p); | 2047 | extern int rt_mutex_getprio(struct task_struct *p); |
2044 | extern void rt_mutex_setprio(struct task_struct *p, int prio); | 2048 | extern void rt_mutex_setprio(struct task_struct *p, int prio); |
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index f6334782a593..959ff18b63b6 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h | |||
@@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p) | |||
100 | * For all intents and purposes a preempted task is a running task. | 100 | * For all intents and purposes a preempted task is a running task. |
101 | */ | 101 | */ |
102 | if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE) | 102 | if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE) |
103 | state = TASK_RUNNING; | 103 | state = TASK_RUNNING | TASK_STATE_MAX; |
104 | #endif | 104 | #endif |
105 | 105 | ||
106 | return state; | 106 | return state; |
@@ -137,13 +137,14 @@ TRACE_EVENT(sched_switch, | |||
137 | __entry->next_prio = next->prio; | 137 | __entry->next_prio = next->prio; |
138 | ), | 138 | ), |
139 | 139 | ||
140 | TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s ==> next_comm=%s next_pid=%d next_prio=%d", | 140 | TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", |
141 | __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, | 141 | __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, |
142 | __entry->prev_state ? | 142 | __entry->prev_state & (TASK_STATE_MAX-1) ? |
143 | __print_flags(__entry->prev_state, "|", | 143 | __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|", |
144 | { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, | 144 | { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, |
145 | { 16, "Z" }, { 32, "X" }, { 64, "x" }, | 145 | { 16, "Z" }, { 32, "X" }, { 64, "x" }, |
146 | { 128, "W" }) : "R", | 146 | { 128, "W" }) : "R", |
147 | __entry->prev_state & TASK_STATE_MAX ? "+" : "", | ||
147 | __entry->next_comm, __entry->next_pid, __entry->next_prio) | 148 | __entry->next_comm, __entry->next_pid, __entry->next_prio) |
148 | ); | 149 | ); |
149 | 150 | ||
diff --git a/init/Kconfig b/init/Kconfig index d62778390e55..d19b3a77ab44 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED | |||
715 | depends on CGROUP_SCHED | 715 | depends on CGROUP_SCHED |
716 | default CGROUP_SCHED | 716 | default CGROUP_SCHED |
717 | 717 | ||
718 | config CFS_BANDWIDTH | ||
719 | bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" | ||
720 | depends on EXPERIMENTAL | ||
721 | depends on FAIR_GROUP_SCHED | ||
722 | default n | ||
723 | help | ||
724 | This option allows users to define CPU bandwidth rates (limits) for | ||
725 | tasks running within the fair group scheduler. Groups with no limit | ||
726 | set are considered to be unconstrained and will run with no | ||
727 | restriction. | ||
728 | See tip/Documentation/scheduler/sched-bwc.txt for more information. | ||
729 | |||
718 | config RT_GROUP_SCHED | 730 | config RT_GROUP_SCHED |
719 | bool "Group scheduling for SCHED_RR/FIFO" | 731 | bool "Group scheduling for SCHED_RR/FIFO" |
720 | depends on EXPERIMENTAL | 732 | depends on EXPERIMENTAL |
diff --git a/kernel/sched.c b/kernel/sched.c index b50b0f0c9aa9..c5cf15e1eb57 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void) | |||
196 | return sysctl_sched_rt_runtime >= 0; | 196 | return sysctl_sched_rt_runtime >= 0; |
197 | } | 197 | } |
198 | 198 | ||
199 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | 199 | static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) |
200 | { | 200 | { |
201 | ktime_t now; | 201 | unsigned long delta; |
202 | ktime_t soft, hard, now; | ||
203 | |||
204 | for (;;) { | ||
205 | if (hrtimer_active(period_timer)) | ||
206 | break; | ||
207 | |||
208 | now = hrtimer_cb_get_time(period_timer); | ||
209 | hrtimer_forward(period_timer, now, period); | ||
210 | |||
211 | soft = hrtimer_get_softexpires(period_timer); | ||
212 | hard = hrtimer_get_expires(period_timer); | ||
213 | delta = ktime_to_ns(ktime_sub(hard, soft)); | ||
214 | __hrtimer_start_range_ns(period_timer, soft, delta, | ||
215 | HRTIMER_MODE_ABS_PINNED, 0); | ||
216 | } | ||
217 | } | ||
202 | 218 | ||
219 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
220 | { | ||
203 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | 221 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) |
204 | return; | 222 | return; |
205 | 223 | ||
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
207 | return; | 225 | return; |
208 | 226 | ||
209 | raw_spin_lock(&rt_b->rt_runtime_lock); | 227 | raw_spin_lock(&rt_b->rt_runtime_lock); |
210 | for (;;) { | 228 | start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); |
211 | unsigned long delta; | ||
212 | ktime_t soft, hard; | ||
213 | |||
214 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
215 | break; | ||
216 | |||
217 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); | ||
218 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); | ||
219 | |||
220 | soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); | ||
221 | hard = hrtimer_get_expires(&rt_b->rt_period_timer); | ||
222 | delta = ktime_to_ns(ktime_sub(hard, soft)); | ||
223 | __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, | ||
224 | HRTIMER_MODE_ABS_PINNED, 0); | ||
225 | } | ||
226 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 229 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
227 | } | 230 | } |
228 | 231 | ||
@@ -247,6 +250,24 @@ struct cfs_rq; | |||
247 | 250 | ||
248 | static LIST_HEAD(task_groups); | 251 | static LIST_HEAD(task_groups); |
249 | 252 | ||
253 | struct cfs_bandwidth { | ||
254 | #ifdef CONFIG_CFS_BANDWIDTH | ||
255 | raw_spinlock_t lock; | ||
256 | ktime_t period; | ||
257 | u64 quota, runtime; | ||
258 | s64 hierarchal_quota; | ||
259 | u64 runtime_expires; | ||
260 | |||
261 | int idle, timer_active; | ||
262 | struct hrtimer period_timer, slack_timer; | ||
263 | struct list_head throttled_cfs_rq; | ||
264 | |||
265 | /* statistics */ | ||
266 | int nr_periods, nr_throttled; | ||
267 | u64 throttled_time; | ||
268 | #endif | ||
269 | }; | ||
270 | |||
250 | /* task group related information */ | 271 | /* task group related information */ |
251 | struct task_group { | 272 | struct task_group { |
252 | struct cgroup_subsys_state css; | 273 | struct cgroup_subsys_state css; |
@@ -278,6 +299,8 @@ struct task_group { | |||
278 | #ifdef CONFIG_SCHED_AUTOGROUP | 299 | #ifdef CONFIG_SCHED_AUTOGROUP |
279 | struct autogroup *autogroup; | 300 | struct autogroup *autogroup; |
280 | #endif | 301 | #endif |
302 | |||
303 | struct cfs_bandwidth cfs_bandwidth; | ||
281 | }; | 304 | }; |
282 | 305 | ||
283 | /* task_group_lock serializes the addition/removal of task groups */ | 306 | /* task_group_lock serializes the addition/removal of task groups */ |
@@ -311,7 +334,7 @@ struct task_group root_task_group; | |||
311 | /* CFS-related fields in a runqueue */ | 334 | /* CFS-related fields in a runqueue */ |
312 | struct cfs_rq { | 335 | struct cfs_rq { |
313 | struct load_weight load; | 336 | struct load_weight load; |
314 | unsigned long nr_running; | 337 | unsigned long nr_running, h_nr_running; |
315 | 338 | ||
316 | u64 exec_clock; | 339 | u64 exec_clock; |
317 | u64 min_vruntime; | 340 | u64 min_vruntime; |
@@ -377,9 +400,120 @@ struct cfs_rq { | |||
377 | 400 | ||
378 | unsigned long load_contribution; | 401 | unsigned long load_contribution; |
379 | #endif | 402 | #endif |
403 | #ifdef CONFIG_CFS_BANDWIDTH | ||
404 | int runtime_enabled; | ||
405 | u64 runtime_expires; | ||
406 | s64 runtime_remaining; | ||
407 | |||
408 | u64 throttled_timestamp; | ||
409 | int throttled, throttle_count; | ||
410 | struct list_head throttled_list; | ||
411 | #endif | ||
380 | #endif | 412 | #endif |
381 | }; | 413 | }; |
382 | 414 | ||
415 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
416 | #ifdef CONFIG_CFS_BANDWIDTH | ||
417 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
418 | { | ||
419 | return &tg->cfs_bandwidth; | ||
420 | } | ||
421 | |||
422 | static inline u64 default_cfs_period(void); | ||
423 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | ||
424 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | ||
425 | |||
426 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | ||
427 | { | ||
428 | struct cfs_bandwidth *cfs_b = | ||
429 | container_of(timer, struct cfs_bandwidth, slack_timer); | ||
430 | do_sched_cfs_slack_timer(cfs_b); | ||
431 | |||
432 | return HRTIMER_NORESTART; | ||
433 | } | ||
434 | |||
435 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | ||
436 | { | ||
437 | struct cfs_bandwidth *cfs_b = | ||
438 | container_of(timer, struct cfs_bandwidth, period_timer); | ||
439 | ktime_t now; | ||
440 | int overrun; | ||
441 | int idle = 0; | ||
442 | |||
443 | for (;;) { | ||
444 | now = hrtimer_cb_get_time(timer); | ||
445 | overrun = hrtimer_forward(timer, now, cfs_b->period); | ||
446 | |||
447 | if (!overrun) | ||
448 | break; | ||
449 | |||
450 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | ||
451 | } | ||
452 | |||
453 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
454 | } | ||
455 | |||
456 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
457 | { | ||
458 | raw_spin_lock_init(&cfs_b->lock); | ||
459 | cfs_b->runtime = 0; | ||
460 | cfs_b->quota = RUNTIME_INF; | ||
461 | cfs_b->period = ns_to_ktime(default_cfs_period()); | ||
462 | |||
463 | INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); | ||
464 | hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
465 | cfs_b->period_timer.function = sched_cfs_period_timer; | ||
466 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
467 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | ||
468 | } | ||
469 | |||
470 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
471 | { | ||
472 | cfs_rq->runtime_enabled = 0; | ||
473 | INIT_LIST_HEAD(&cfs_rq->throttled_list); | ||
474 | } | ||
475 | |||
476 | /* requires cfs_b->lock, may release to reprogram timer */ | ||
477 | static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
478 | { | ||
479 | /* | ||
480 | * The timer may be active because we're trying to set a new bandwidth | ||
481 | * period or because we're racing with the tear-down path | ||
482 | * (timer_active==0 becomes visible before the hrtimer call-back | ||
483 | * terminates). In either case we ensure that it's re-programmed | ||
484 | */ | ||
485 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | ||
486 | raw_spin_unlock(&cfs_b->lock); | ||
487 | /* ensure cfs_b->lock is available while we wait */ | ||
488 | hrtimer_cancel(&cfs_b->period_timer); | ||
489 | |||
490 | raw_spin_lock(&cfs_b->lock); | ||
491 | /* if someone else restarted the timer then we're done */ | ||
492 | if (cfs_b->timer_active) | ||
493 | return; | ||
494 | } | ||
495 | |||
496 | cfs_b->timer_active = 1; | ||
497 | start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); | ||
498 | } | ||
499 | |||
500 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
501 | { | ||
502 | hrtimer_cancel(&cfs_b->period_timer); | ||
503 | hrtimer_cancel(&cfs_b->slack_timer); | ||
504 | } | ||
505 | #else | ||
506 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
507 | static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
508 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
509 | |||
510 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
511 | { | ||
512 | return NULL; | ||
513 | } | ||
514 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
515 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
516 | |||
383 | /* Real-Time classes' related field in a runqueue: */ | 517 | /* Real-Time classes' related field in a runqueue: */ |
384 | struct rt_rq { | 518 | struct rt_rq { |
385 | struct rt_prio_array active; | 519 | struct rt_prio_array active; |
@@ -520,8 +654,6 @@ struct rq { | |||
520 | int cpu; | 654 | int cpu; |
521 | int online; | 655 | int online; |
522 | 656 | ||
523 | unsigned long avg_load_per_task; | ||
524 | |||
525 | u64 rt_avg; | 657 | u64 rt_avg; |
526 | u64 age_stamp; | 658 | u64 age_stamp; |
527 | u64 idle_stamp; | 659 | u64 idle_stamp; |
@@ -1471,24 +1603,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) | |||
1471 | update_load_sub(&rq->load, load); | 1603 | update_load_sub(&rq->load, load); |
1472 | } | 1604 | } |
1473 | 1605 | ||
1474 | #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) | 1606 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ |
1607 | (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) | ||
1475 | typedef int (*tg_visitor)(struct task_group *, void *); | 1608 | typedef int (*tg_visitor)(struct task_group *, void *); |
1476 | 1609 | ||
1477 | /* | 1610 | /* |
1478 | * Iterate the full tree, calling @down when first entering a node and @up when | 1611 | * Iterate task_group tree rooted at *from, calling @down when first entering a |
1479 | * leaving it for the final time. | 1612 | * node and @up when leaving it for the final time. |
1613 | * | ||
1614 | * Caller must hold rcu_lock or sufficient equivalent. | ||
1480 | */ | 1615 | */ |
1481 | static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | 1616 | static int walk_tg_tree_from(struct task_group *from, |
1617 | tg_visitor down, tg_visitor up, void *data) | ||
1482 | { | 1618 | { |
1483 | struct task_group *parent, *child; | 1619 | struct task_group *parent, *child; |
1484 | int ret; | 1620 | int ret; |
1485 | 1621 | ||
1486 | rcu_read_lock(); | 1622 | parent = from; |
1487 | parent = &root_task_group; | 1623 | |
1488 | down: | 1624 | down: |
1489 | ret = (*down)(parent, data); | 1625 | ret = (*down)(parent, data); |
1490 | if (ret) | 1626 | if (ret) |
1491 | goto out_unlock; | 1627 | goto out; |
1492 | list_for_each_entry_rcu(child, &parent->children, siblings) { | 1628 | list_for_each_entry_rcu(child, &parent->children, siblings) { |
1493 | parent = child; | 1629 | parent = child; |
1494 | goto down; | 1630 | goto down; |
@@ -1497,19 +1633,29 @@ up: | |||
1497 | continue; | 1633 | continue; |
1498 | } | 1634 | } |
1499 | ret = (*up)(parent, data); | 1635 | ret = (*up)(parent, data); |
1500 | if (ret) | 1636 | if (ret || parent == from) |
1501 | goto out_unlock; | 1637 | goto out; |
1502 | 1638 | ||
1503 | child = parent; | 1639 | child = parent; |
1504 | parent = parent->parent; | 1640 | parent = parent->parent; |
1505 | if (parent) | 1641 | if (parent) |
1506 | goto up; | 1642 | goto up; |
1507 | out_unlock: | 1643 | out: |
1508 | rcu_read_unlock(); | ||
1509 | |||
1510 | return ret; | 1644 | return ret; |
1511 | } | 1645 | } |
1512 | 1646 | ||
1647 | /* | ||
1648 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
1649 | * leaving it for the final time. | ||
1650 | * | ||
1651 | * Caller must hold rcu_lock or sufficient equivalent. | ||
1652 | */ | ||
1653 | |||
1654 | static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | ||
1655 | { | ||
1656 | return walk_tg_tree_from(&root_task_group, down, up, data); | ||
1657 | } | ||
1658 | |||
1513 | static int tg_nop(struct task_group *tg, void *data) | 1659 | static int tg_nop(struct task_group *tg, void *data) |
1514 | { | 1660 | { |
1515 | return 0; | 1661 | return 0; |
@@ -1569,11 +1715,9 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1569 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | 1715 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); |
1570 | 1716 | ||
1571 | if (nr_running) | 1717 | if (nr_running) |
1572 | rq->avg_load_per_task = rq->load.weight / nr_running; | 1718 | return rq->load.weight / nr_running; |
1573 | else | ||
1574 | rq->avg_load_per_task = 0; | ||
1575 | 1719 | ||
1576 | return rq->avg_load_per_task; | 1720 | return 0; |
1577 | } | 1721 | } |
1578 | 1722 | ||
1579 | #ifdef CONFIG_PREEMPT | 1723 | #ifdef CONFIG_PREEMPT |
@@ -1806,7 +1950,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1806 | rq->nr_uninterruptible--; | 1950 | rq->nr_uninterruptible--; |
1807 | 1951 | ||
1808 | enqueue_task(rq, p, flags); | 1952 | enqueue_task(rq, p, flags); |
1809 | inc_nr_running(rq); | ||
1810 | } | 1953 | } |
1811 | 1954 | ||
1812 | /* | 1955 | /* |
@@ -1818,7 +1961,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1818 | rq->nr_uninterruptible++; | 1961 | rq->nr_uninterruptible++; |
1819 | 1962 | ||
1820 | dequeue_task(rq, p, flags); | 1963 | dequeue_task(rq, p, flags); |
1821 | dec_nr_running(rq); | ||
1822 | } | 1964 | } |
1823 | 1965 | ||
1824 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 1966 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
@@ -2848,19 +2990,23 @@ void sched_fork(struct task_struct *p) | |||
2848 | p->state = TASK_RUNNING; | 2990 | p->state = TASK_RUNNING; |
2849 | 2991 | ||
2850 | /* | 2992 | /* |
2993 | * Make sure we do not leak PI boosting priority to the child. | ||
2994 | */ | ||
2995 | p->prio = current->normal_prio; | ||
2996 | |||
2997 | /* | ||
2851 | * Revert to default priority/policy on fork if requested. | 2998 | * Revert to default priority/policy on fork if requested. |
2852 | */ | 2999 | */ |
2853 | if (unlikely(p->sched_reset_on_fork)) { | 3000 | if (unlikely(p->sched_reset_on_fork)) { |
2854 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { | 3001 | if (task_has_rt_policy(p)) { |
2855 | p->policy = SCHED_NORMAL; | 3002 | p->policy = SCHED_NORMAL; |
2856 | p->normal_prio = p->static_prio; | ||
2857 | } | ||
2858 | |||
2859 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
2860 | p->static_prio = NICE_TO_PRIO(0); | 3003 | p->static_prio = NICE_TO_PRIO(0); |
2861 | p->normal_prio = p->static_prio; | 3004 | p->rt_priority = 0; |
2862 | set_load_weight(p); | 3005 | } else if (PRIO_TO_NICE(p->static_prio) < 0) |
2863 | } | 3006 | p->static_prio = NICE_TO_PRIO(0); |
3007 | |||
3008 | p->prio = p->normal_prio = __normal_prio(p); | ||
3009 | set_load_weight(p); | ||
2864 | 3010 | ||
2865 | /* | 3011 | /* |
2866 | * We don't need the reset flag anymore after the fork. It has | 3012 | * We don't need the reset flag anymore after the fork. It has |
@@ -2869,11 +3015,6 @@ void sched_fork(struct task_struct *p) | |||
2869 | p->sched_reset_on_fork = 0; | 3015 | p->sched_reset_on_fork = 0; |
2870 | } | 3016 | } |
2871 | 3017 | ||
2872 | /* | ||
2873 | * Make sure we do not leak PI boosting priority to the child. | ||
2874 | */ | ||
2875 | p->prio = current->normal_prio; | ||
2876 | |||
2877 | if (!rt_prio(p->prio)) | 3018 | if (!rt_prio(p->prio)) |
2878 | p->sched_class = &fair_sched_class; | 3019 | p->sched_class = &fair_sched_class; |
2879 | 3020 | ||
@@ -4239,7 +4380,7 @@ pick_next_task(struct rq *rq) | |||
4239 | * Optimization: we know that if all tasks are in | 4380 | * Optimization: we know that if all tasks are in |
4240 | * the fair class we can call that function directly: | 4381 | * the fair class we can call that function directly: |
4241 | */ | 4382 | */ |
4242 | if (likely(rq->nr_running == rq->cfs.nr_running)) { | 4383 | if (likely(rq->nr_running == rq->cfs.h_nr_running)) { |
4243 | p = fair_sched_class.pick_next_task(rq); | 4384 | p = fair_sched_class.pick_next_task(rq); |
4244 | if (likely(p)) | 4385 | if (likely(p)) |
4245 | return p; | 4386 | return p; |
@@ -6197,6 +6338,30 @@ static void calc_global_load_remove(struct rq *rq) | |||
6197 | rq->calc_load_active = 0; | 6338 | rq->calc_load_active = 0; |
6198 | } | 6339 | } |
6199 | 6340 | ||
6341 | #ifdef CONFIG_CFS_BANDWIDTH | ||
6342 | static void unthrottle_offline_cfs_rqs(struct rq *rq) | ||
6343 | { | ||
6344 | struct cfs_rq *cfs_rq; | ||
6345 | |||
6346 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
6347 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
6348 | |||
6349 | if (!cfs_rq->runtime_enabled) | ||
6350 | continue; | ||
6351 | |||
6352 | /* | ||
6353 | * clock_task is not advancing so we just need to make sure | ||
6354 | * there's some valid quota amount | ||
6355 | */ | ||
6356 | cfs_rq->runtime_remaining = cfs_b->quota; | ||
6357 | if (cfs_rq_throttled(cfs_rq)) | ||
6358 | unthrottle_cfs_rq(cfs_rq); | ||
6359 | } | ||
6360 | } | ||
6361 | #else | ||
6362 | static void unthrottle_offline_cfs_rqs(struct rq *rq) {} | ||
6363 | #endif | ||
6364 | |||
6200 | /* | 6365 | /* |
6201 | * Migrate all tasks from the rq, sleeping tasks will be migrated by | 6366 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
6202 | * try_to_wake_up()->select_task_rq(). | 6367 | * try_to_wake_up()->select_task_rq(). |
@@ -6222,6 +6387,9 @@ static void migrate_tasks(unsigned int dead_cpu) | |||
6222 | */ | 6387 | */ |
6223 | rq->stop = NULL; | 6388 | rq->stop = NULL; |
6224 | 6389 | ||
6390 | /* Ensure any throttled groups are reachable by pick_next_task */ | ||
6391 | unthrottle_offline_cfs_rqs(rq); | ||
6392 | |||
6225 | for ( ; ; ) { | 6393 | for ( ; ; ) { |
6226 | /* | 6394 | /* |
6227 | * There's this thread running, bail when that's the only | 6395 | * There's this thread running, bail when that's the only |
@@ -7965,6 +8133,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
7965 | /* allow initial update_cfs_load() to truncate */ | 8133 | /* allow initial update_cfs_load() to truncate */ |
7966 | cfs_rq->load_stamp = 1; | 8134 | cfs_rq->load_stamp = 1; |
7967 | #endif | 8135 | #endif |
8136 | init_cfs_rq_runtime(cfs_rq); | ||
7968 | 8137 | ||
7969 | tg->cfs_rq[cpu] = cfs_rq; | 8138 | tg->cfs_rq[cpu] = cfs_rq; |
7970 | tg->se[cpu] = se; | 8139 | tg->se[cpu] = se; |
@@ -8104,6 +8273,7 @@ void __init sched_init(void) | |||
8104 | * We achieve this by letting root_task_group's tasks sit | 8273 | * We achieve this by letting root_task_group's tasks sit |
8105 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). | 8274 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). |
8106 | */ | 8275 | */ |
8276 | init_cfs_bandwidth(&root_task_group.cfs_bandwidth); | ||
8107 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); | 8277 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
8108 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 8278 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8109 | 8279 | ||
@@ -8345,6 +8515,8 @@ static void free_fair_sched_group(struct task_group *tg) | |||
8345 | { | 8515 | { |
8346 | int i; | 8516 | int i; |
8347 | 8517 | ||
8518 | destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
8519 | |||
8348 | for_each_possible_cpu(i) { | 8520 | for_each_possible_cpu(i) { |
8349 | if (tg->cfs_rq) | 8521 | if (tg->cfs_rq) |
8350 | kfree(tg->cfs_rq[i]); | 8522 | kfree(tg->cfs_rq[i]); |
@@ -8372,6 +8544,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8372 | 8544 | ||
8373 | tg->shares = NICE_0_LOAD; | 8545 | tg->shares = NICE_0_LOAD; |
8374 | 8546 | ||
8547 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
8548 | |||
8375 | for_each_possible_cpu(i) { | 8549 | for_each_possible_cpu(i) { |
8376 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 8550 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), |
8377 | GFP_KERNEL, cpu_to_node(i)); | 8551 | GFP_KERNEL, cpu_to_node(i)); |
@@ -8647,12 +8821,7 @@ unsigned long sched_group_shares(struct task_group *tg) | |||
8647 | } | 8821 | } |
8648 | #endif | 8822 | #endif |
8649 | 8823 | ||
8650 | #ifdef CONFIG_RT_GROUP_SCHED | 8824 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) |
8651 | /* | ||
8652 | * Ensure that the real time constraints are schedulable. | ||
8653 | */ | ||
8654 | static DEFINE_MUTEX(rt_constraints_mutex); | ||
8655 | |||
8656 | static unsigned long to_ratio(u64 period, u64 runtime) | 8825 | static unsigned long to_ratio(u64 period, u64 runtime) |
8657 | { | 8826 | { |
8658 | if (runtime == RUNTIME_INF) | 8827 | if (runtime == RUNTIME_INF) |
@@ -8660,6 +8829,13 @@ static unsigned long to_ratio(u64 period, u64 runtime) | |||
8660 | 8829 | ||
8661 | return div64_u64(runtime << 20, period); | 8830 | return div64_u64(runtime << 20, period); |
8662 | } | 8831 | } |
8832 | #endif | ||
8833 | |||
8834 | #ifdef CONFIG_RT_GROUP_SCHED | ||
8835 | /* | ||
8836 | * Ensure that the real time constraints are schedulable. | ||
8837 | */ | ||
8838 | static DEFINE_MUTEX(rt_constraints_mutex); | ||
8663 | 8839 | ||
8664 | /* Must be called with tasklist_lock held */ | 8840 | /* Must be called with tasklist_lock held */ |
8665 | static inline int tg_has_rt_tasks(struct task_group *tg) | 8841 | static inline int tg_has_rt_tasks(struct task_group *tg) |
@@ -8680,7 +8856,7 @@ struct rt_schedulable_data { | |||
8680 | u64 rt_runtime; | 8856 | u64 rt_runtime; |
8681 | }; | 8857 | }; |
8682 | 8858 | ||
8683 | static int tg_schedulable(struct task_group *tg, void *data) | 8859 | static int tg_rt_schedulable(struct task_group *tg, void *data) |
8684 | { | 8860 | { |
8685 | struct rt_schedulable_data *d = data; | 8861 | struct rt_schedulable_data *d = data; |
8686 | struct task_group *child; | 8862 | struct task_group *child; |
@@ -8738,16 +8914,22 @@ static int tg_schedulable(struct task_group *tg, void *data) | |||
8738 | 8914 | ||
8739 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 8915 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
8740 | { | 8916 | { |
8917 | int ret; | ||
8918 | |||
8741 | struct rt_schedulable_data data = { | 8919 | struct rt_schedulable_data data = { |
8742 | .tg = tg, | 8920 | .tg = tg, |
8743 | .rt_period = period, | 8921 | .rt_period = period, |
8744 | .rt_runtime = runtime, | 8922 | .rt_runtime = runtime, |
8745 | }; | 8923 | }; |
8746 | 8924 | ||
8747 | return walk_tg_tree(tg_schedulable, tg_nop, &data); | 8925 | rcu_read_lock(); |
8926 | ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); | ||
8927 | rcu_read_unlock(); | ||
8928 | |||
8929 | return ret; | ||
8748 | } | 8930 | } |
8749 | 8931 | ||
8750 | static int tg_set_bandwidth(struct task_group *tg, | 8932 | static int tg_set_rt_bandwidth(struct task_group *tg, |
8751 | u64 rt_period, u64 rt_runtime) | 8933 | u64 rt_period, u64 rt_runtime) |
8752 | { | 8934 | { |
8753 | int i, err = 0; | 8935 | int i, err = 0; |
@@ -8786,7 +8968,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
8786 | if (rt_runtime_us < 0) | 8968 | if (rt_runtime_us < 0) |
8787 | rt_runtime = RUNTIME_INF; | 8969 | rt_runtime = RUNTIME_INF; |
8788 | 8970 | ||
8789 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8971 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
8790 | } | 8972 | } |
8791 | 8973 | ||
8792 | long sched_group_rt_runtime(struct task_group *tg) | 8974 | long sched_group_rt_runtime(struct task_group *tg) |
@@ -8811,7 +8993,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | |||
8811 | if (rt_period == 0) | 8993 | if (rt_period == 0) |
8812 | return -EINVAL; | 8994 | return -EINVAL; |
8813 | 8995 | ||
8814 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 8996 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
8815 | } | 8997 | } |
8816 | 8998 | ||
8817 | long sched_group_rt_period(struct task_group *tg) | 8999 | long sched_group_rt_period(struct task_group *tg) |
@@ -9001,6 +9183,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
9001 | 9183 | ||
9002 | return (u64) scale_load_down(tg->shares); | 9184 | return (u64) scale_load_down(tg->shares); |
9003 | } | 9185 | } |
9186 | |||
9187 | #ifdef CONFIG_CFS_BANDWIDTH | ||
9188 | static DEFINE_MUTEX(cfs_constraints_mutex); | ||
9189 | |||
9190 | const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ | ||
9191 | const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ | ||
9192 | |||
9193 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); | ||
9194 | |||
9195 | static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | ||
9196 | { | ||
9197 | int i, ret = 0, runtime_enabled; | ||
9198 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
9199 | |||
9200 | if (tg == &root_task_group) | ||
9201 | return -EINVAL; | ||
9202 | |||
9203 | /* | ||
9204 | * Ensure we have at least some amount of bandwidth every period. This is | ||
9205 | * to prevent reaching a state of large arrears when throttled via | ||
9206 | * entity_tick() resulting in prolonged exit starvation. | ||
9207 | */ | ||
9208 | if (quota < min_cfs_quota_period || period < min_cfs_quota_period) | ||
9209 | return -EINVAL; | ||
9210 | |||
9211 | /* | ||
9212 | * Likewise, bound things on the other side by preventing insane quota | ||
9213 | * periods. This also allows us to normalize in computing quota | ||
9214 | * feasibility. | ||
9215 | */ | ||
9216 | if (period > max_cfs_quota_period) | ||
9217 | return -EINVAL; | ||
9218 | |||
9219 | mutex_lock(&cfs_constraints_mutex); | ||
9220 | ret = __cfs_schedulable(tg, period, quota); | ||
9221 | if (ret) | ||
9222 | goto out_unlock; | ||
9223 | |||
9224 | runtime_enabled = quota != RUNTIME_INF; | ||
9225 | raw_spin_lock_irq(&cfs_b->lock); | ||
9226 | cfs_b->period = ns_to_ktime(period); | ||
9227 | cfs_b->quota = quota; | ||
9228 | |||
9229 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
9230 | /* restart the period timer (if active) to handle new period expiry */ | ||
9231 | if (runtime_enabled && cfs_b->timer_active) { | ||
9232 | /* force a reprogram */ | ||
9233 | cfs_b->timer_active = 0; | ||
9234 | __start_cfs_bandwidth(cfs_b); | ||
9235 | } | ||
9236 | raw_spin_unlock_irq(&cfs_b->lock); | ||
9237 | |||
9238 | for_each_possible_cpu(i) { | ||
9239 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; | ||
9240 | struct rq *rq = rq_of(cfs_rq); | ||
9241 | |||
9242 | raw_spin_lock_irq(&rq->lock); | ||
9243 | cfs_rq->runtime_enabled = runtime_enabled; | ||
9244 | cfs_rq->runtime_remaining = 0; | ||
9245 | |||
9246 | if (cfs_rq_throttled(cfs_rq)) | ||
9247 | unthrottle_cfs_rq(cfs_rq); | ||
9248 | raw_spin_unlock_irq(&rq->lock); | ||
9249 | } | ||
9250 | out_unlock: | ||
9251 | mutex_unlock(&cfs_constraints_mutex); | ||
9252 | |||
9253 | return ret; | ||
9254 | } | ||
9255 | |||
9256 | int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) | ||
9257 | { | ||
9258 | u64 quota, period; | ||
9259 | |||
9260 | period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | ||
9261 | if (cfs_quota_us < 0) | ||
9262 | quota = RUNTIME_INF; | ||
9263 | else | ||
9264 | quota = (u64)cfs_quota_us * NSEC_PER_USEC; | ||
9265 | |||
9266 | return tg_set_cfs_bandwidth(tg, period, quota); | ||
9267 | } | ||
9268 | |||
9269 | long tg_get_cfs_quota(struct task_group *tg) | ||
9270 | { | ||
9271 | u64 quota_us; | ||
9272 | |||
9273 | if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) | ||
9274 | return -1; | ||
9275 | |||
9276 | quota_us = tg_cfs_bandwidth(tg)->quota; | ||
9277 | do_div(quota_us, NSEC_PER_USEC); | ||
9278 | |||
9279 | return quota_us; | ||
9280 | } | ||
9281 | |||
9282 | int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) | ||
9283 | { | ||
9284 | u64 quota, period; | ||
9285 | |||
9286 | period = (u64)cfs_period_us * NSEC_PER_USEC; | ||
9287 | quota = tg_cfs_bandwidth(tg)->quota; | ||
9288 | |||
9289 | if (period <= 0) | ||
9290 | return -EINVAL; | ||
9291 | |||
9292 | return tg_set_cfs_bandwidth(tg, period, quota); | ||
9293 | } | ||
9294 | |||
9295 | long tg_get_cfs_period(struct task_group *tg) | ||
9296 | { | ||
9297 | u64 cfs_period_us; | ||
9298 | |||
9299 | cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); | ||
9300 | do_div(cfs_period_us, NSEC_PER_USEC); | ||
9301 | |||
9302 | return cfs_period_us; | ||
9303 | } | ||
9304 | |||
9305 | static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) | ||
9306 | { | ||
9307 | return tg_get_cfs_quota(cgroup_tg(cgrp)); | ||
9308 | } | ||
9309 | |||
9310 | static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, | ||
9311 | s64 cfs_quota_us) | ||
9312 | { | ||
9313 | return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); | ||
9314 | } | ||
9315 | |||
9316 | static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) | ||
9317 | { | ||
9318 | return tg_get_cfs_period(cgroup_tg(cgrp)); | ||
9319 | } | ||
9320 | |||
9321 | static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, | ||
9322 | u64 cfs_period_us) | ||
9323 | { | ||
9324 | return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); | ||
9325 | } | ||
9326 | |||
9327 | struct cfs_schedulable_data { | ||
9328 | struct task_group *tg; | ||
9329 | u64 period, quota; | ||
9330 | }; | ||
9331 | |||
9332 | /* | ||
9333 | * normalize group quota/period to be quota/max_period | ||
9334 | * note: units are usecs | ||
9335 | */ | ||
9336 | static u64 normalize_cfs_quota(struct task_group *tg, | ||
9337 | struct cfs_schedulable_data *d) | ||
9338 | { | ||
9339 | u64 quota, period; | ||
9340 | |||
9341 | if (tg == d->tg) { | ||
9342 | period = d->period; | ||
9343 | quota = d->quota; | ||
9344 | } else { | ||
9345 | period = tg_get_cfs_period(tg); | ||
9346 | quota = tg_get_cfs_quota(tg); | ||
9347 | } | ||
9348 | |||
9349 | /* note: these should typically be equivalent */ | ||
9350 | if (quota == RUNTIME_INF || quota == -1) | ||
9351 | return RUNTIME_INF; | ||
9352 | |||
9353 | return to_ratio(period, quota); | ||
9354 | } | ||
9355 | |||
9356 | static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | ||
9357 | { | ||
9358 | struct cfs_schedulable_data *d = data; | ||
9359 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
9360 | s64 quota = 0, parent_quota = -1; | ||
9361 | |||
9362 | if (!tg->parent) { | ||
9363 | quota = RUNTIME_INF; | ||
9364 | } else { | ||
9365 | struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); | ||
9366 | |||
9367 | quota = normalize_cfs_quota(tg, d); | ||
9368 | parent_quota = parent_b->hierarchal_quota; | ||
9369 | |||
9370 | /* | ||
9371 | * ensure max(child_quota) <= parent_quota, inherit when no | ||
9372 | * limit is set | ||
9373 | */ | ||
9374 | if (quota == RUNTIME_INF) | ||
9375 | quota = parent_quota; | ||
9376 | else if (parent_quota != RUNTIME_INF && quota > parent_quota) | ||
9377 | return -EINVAL; | ||
9378 | } | ||
9379 | cfs_b->hierarchal_quota = quota; | ||
9380 | |||
9381 | return 0; | ||
9382 | } | ||
9383 | |||
9384 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) | ||
9385 | { | ||
9386 | int ret; | ||
9387 | struct cfs_schedulable_data data = { | ||
9388 | .tg = tg, | ||
9389 | .period = period, | ||
9390 | .quota = quota, | ||
9391 | }; | ||
9392 | |||
9393 | if (quota != RUNTIME_INF) { | ||
9394 | do_div(data.period, NSEC_PER_USEC); | ||
9395 | do_div(data.quota, NSEC_PER_USEC); | ||
9396 | } | ||
9397 | |||
9398 | rcu_read_lock(); | ||
9399 | ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); | ||
9400 | rcu_read_unlock(); | ||
9401 | |||
9402 | return ret; | ||
9403 | } | ||
9404 | |||
9405 | static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | ||
9406 | struct cgroup_map_cb *cb) | ||
9407 | { | ||
9408 | struct task_group *tg = cgroup_tg(cgrp); | ||
9409 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
9410 | |||
9411 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | ||
9412 | cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); | ||
9413 | cb->fill(cb, "throttled_time", cfs_b->throttled_time); | ||
9414 | |||
9415 | return 0; | ||
9416 | } | ||
9417 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
9004 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 9418 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
9005 | 9419 | ||
9006 | #ifdef CONFIG_RT_GROUP_SCHED | 9420 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -9035,6 +9449,22 @@ static struct cftype cpu_files[] = { | |||
9035 | .write_u64 = cpu_shares_write_u64, | 9449 | .write_u64 = cpu_shares_write_u64, |
9036 | }, | 9450 | }, |
9037 | #endif | 9451 | #endif |
9452 | #ifdef CONFIG_CFS_BANDWIDTH | ||
9453 | { | ||
9454 | .name = "cfs_quota_us", | ||
9455 | .read_s64 = cpu_cfs_quota_read_s64, | ||
9456 | .write_s64 = cpu_cfs_quota_write_s64, | ||
9457 | }, | ||
9458 | { | ||
9459 | .name = "cfs_period_us", | ||
9460 | .read_u64 = cpu_cfs_period_read_u64, | ||
9461 | .write_u64 = cpu_cfs_period_write_u64, | ||
9462 | }, | ||
9463 | { | ||
9464 | .name = "stat", | ||
9465 | .read_map = cpu_stats_show, | ||
9466 | }, | ||
9467 | #endif | ||
9038 | #ifdef CONFIG_RT_GROUP_SCHED | 9468 | #ifdef CONFIG_RT_GROUP_SCHED |
9039 | { | 9469 | { |
9040 | .name = "rt_runtime_us", | 9470 | .name = "rt_runtime_us", |
@@ -9344,4 +9774,3 @@ struct cgroup_subsys cpuacct_subsys = { | |||
9344 | .subsys_id = cpuacct_subsys_id, | 9774 | .subsys_id = cpuacct_subsys_id, |
9345 | }; | 9775 | }; |
9346 | #endif /* CONFIG_CGROUP_CPUACCT */ | 9776 | #endif /* CONFIG_CGROUP_CPUACCT */ |
9347 | |||
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 2722dc1b4138..a86cf9d9eb11 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c | |||
@@ -47,9 +47,6 @@ static int convert_prio(int prio) | |||
47 | return cpupri; | 47 | return cpupri; |
48 | } | 48 | } |
49 | 49 | ||
50 | #define for_each_cpupri_active(array, idx) \ | ||
51 | for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) | ||
52 | |||
53 | /** | 50 | /** |
54 | * cpupri_find - find the best (lowest-pri) CPU in the system | 51 | * cpupri_find - find the best (lowest-pri) CPU in the system |
55 | * @cp: The cpupri context | 52 | * @cp: The cpupri context |
@@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
71 | int idx = 0; | 68 | int idx = 0; |
72 | int task_pri = convert_prio(p->prio); | 69 | int task_pri = convert_prio(p->prio); |
73 | 70 | ||
74 | for_each_cpupri_active(cp->pri_active, idx) { | 71 | if (task_pri >= MAX_RT_PRIO) |
75 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; | 72 | return 0; |
76 | 73 | ||
77 | if (idx >= task_pri) | 74 | for (idx = 0; idx < task_pri; idx++) { |
78 | break; | 75 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; |
76 | int skip = 0; | ||
77 | |||
78 | if (!atomic_read(&(vec)->count)) | ||
79 | skip = 1; | ||
80 | /* | ||
81 | * When looking at the vector, we need to read the counter, | ||
82 | * do a memory barrier, then read the mask. | ||
83 | * | ||
84 | * Note: This is still all racy, but we can deal with it. | ||
85 | * Ideally, we only want to look at masks that are set. | ||
86 | * | ||
87 | * If a mask is not set, then the only thing wrong is that we | ||
88 | * did a little more work than necessary. | ||
89 | * | ||
90 | * If we read a zero count but the mask is set, because of the | ||
91 | * memory barriers, that can only happen when the highest prio | ||
92 | * task for a run queue has left the run queue, in which case, | ||
93 | * it will be followed by a pull. If the task we are processing | ||
94 | * fails to find a proper place to go, that pull request will | ||
95 | * pull this task if the run queue is running at a lower | ||
96 | * priority. | ||
97 | */ | ||
98 | smp_rmb(); | ||
99 | |||
100 | /* Need to do the rmb for every iteration */ | ||
101 | if (skip) | ||
102 | continue; | ||
79 | 103 | ||
80 | if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) | 104 | if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) |
81 | continue; | 105 | continue; |
@@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
115 | { | 139 | { |
116 | int *currpri = &cp->cpu_to_pri[cpu]; | 140 | int *currpri = &cp->cpu_to_pri[cpu]; |
117 | int oldpri = *currpri; | 141 | int oldpri = *currpri; |
118 | unsigned long flags; | 142 | int do_mb = 0; |
119 | 143 | ||
120 | newpri = convert_prio(newpri); | 144 | newpri = convert_prio(newpri); |
121 | 145 | ||
@@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
128 | * If the cpu was currently mapped to a different value, we | 152 | * If the cpu was currently mapped to a different value, we |
129 | * need to map it to the new value then remove the old value. | 153 | * need to map it to the new value then remove the old value. |
130 | * Note, we must add the new value first, otherwise we risk the | 154 | * Note, we must add the new value first, otherwise we risk the |
131 | * cpu being cleared from pri_active, and this cpu could be | 155 | * cpu being missed by the priority loop in cpupri_find. |
132 | * missed for a push or pull. | ||
133 | */ | 156 | */ |
134 | if (likely(newpri != CPUPRI_INVALID)) { | 157 | if (likely(newpri != CPUPRI_INVALID)) { |
135 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; | 158 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; |
136 | 159 | ||
137 | raw_spin_lock_irqsave(&vec->lock, flags); | ||
138 | |||
139 | cpumask_set_cpu(cpu, vec->mask); | 160 | cpumask_set_cpu(cpu, vec->mask); |
140 | vec->count++; | 161 | /* |
141 | if (vec->count == 1) | 162 | * When adding a new vector, we update the mask first, |
142 | set_bit(newpri, cp->pri_active); | 163 | * do a write memory barrier, and then update the count, to |
143 | 164 | * make sure the vector is visible when count is set. | |
144 | raw_spin_unlock_irqrestore(&vec->lock, flags); | 165 | */ |
166 | smp_mb__before_atomic_inc(); | ||
167 | atomic_inc(&(vec)->count); | ||
168 | do_mb = 1; | ||
145 | } | 169 | } |
146 | if (likely(oldpri != CPUPRI_INVALID)) { | 170 | if (likely(oldpri != CPUPRI_INVALID)) { |
147 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; | 171 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; |
148 | 172 | ||
149 | raw_spin_lock_irqsave(&vec->lock, flags); | 173 | /* |
150 | 174 | * Because the order of modification of the vec->count | |
151 | vec->count--; | 175 | * is important, we must make sure that the update |
152 | if (!vec->count) | 176 | * of the new prio is seen before we decrement the |
153 | clear_bit(oldpri, cp->pri_active); | 177 | * old prio. This makes sure that the loop sees |
178 | * one or the other when we raise the priority of | ||
179 | * the run queue. We don't care about when we lower the | ||
180 | * priority, as that will trigger an rt pull anyway. | ||
181 | * | ||
182 | * We only need to do a memory barrier if we updated | ||
183 | * the new priority vec. | ||
184 | */ | ||
185 | if (do_mb) | ||
186 | smp_mb__after_atomic_inc(); | ||
187 | |||
188 | /* | ||
189 | * When removing from the vector, we decrement the counter first | ||
190 | * do a memory barrier and then clear the mask. | ||
191 | */ | ||
192 | atomic_dec(&(vec)->count); | ||
193 | smp_mb__after_atomic_inc(); | ||
154 | cpumask_clear_cpu(cpu, vec->mask); | 194 | cpumask_clear_cpu(cpu, vec->mask); |
155 | |||
156 | raw_spin_unlock_irqrestore(&vec->lock, flags); | ||
157 | } | 195 | } |
158 | 196 | ||
159 | *currpri = newpri; | 197 | *currpri = newpri; |
@@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp) | |||
175 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { | 213 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { |
176 | struct cpupri_vec *vec = &cp->pri_to_cpu[i]; | 214 | struct cpupri_vec *vec = &cp->pri_to_cpu[i]; |
177 | 215 | ||
178 | raw_spin_lock_init(&vec->lock); | 216 | atomic_set(&vec->count, 0); |
179 | vec->count = 0; | ||
180 | if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) | 217 | if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) |
181 | goto cleanup; | 218 | goto cleanup; |
182 | } | 219 | } |
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 9fc7d386fea4..f6d756173491 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h | |||
@@ -4,7 +4,6 @@ | |||
4 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
5 | 5 | ||
6 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) | 6 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) |
7 | #define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) | ||
8 | 7 | ||
9 | #define CPUPRI_INVALID -1 | 8 | #define CPUPRI_INVALID -1 |
10 | #define CPUPRI_IDLE 0 | 9 | #define CPUPRI_IDLE 0 |
@@ -12,14 +11,12 @@ | |||
12 | /* values 2-101 are RT priorities 0-99 */ | 11 | /* values 2-101 are RT priorities 0-99 */ |
13 | 12 | ||
14 | struct cpupri_vec { | 13 | struct cpupri_vec { |
15 | raw_spinlock_t lock; | 14 | atomic_t count; |
16 | int count; | 15 | cpumask_var_t mask; |
17 | cpumask_var_t mask; | ||
18 | }; | 16 | }; |
19 | 17 | ||
20 | struct cpupri { | 18 | struct cpupri { |
21 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; | 19 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; |
22 | long pri_active[CPUPRI_NR_PRI_WORDS]; | ||
23 | int cpu_to_pri[NR_CPUS]; | 20 | int cpu_to_pri[NR_CPUS]; |
24 | }; | 21 | }; |
25 | 22 | ||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bc8ee9993814..fef0bfde7c8c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | |||
89 | */ | 89 | */ |
90 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | 90 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; |
91 | 91 | ||
92 | #ifdef CONFIG_CFS_BANDWIDTH | ||
93 | /* | ||
94 | * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool | ||
95 | * each time a cfs_rq requests quota. | ||
96 | * | ||
97 | * Note: in the case that the slice exceeds the runtime remaining (either due | ||
98 | * to consumption or the quota being specified to be smaller than the slice) | ||
99 | * we will always only issue the remaining available time. | ||
100 | * | ||
101 | * default: 5 msec, units: microseconds | ||
102 | */ | ||
103 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | ||
104 | #endif | ||
105 | |||
92 | static const struct sched_class fair_sched_class; | 106 | static const struct sched_class fair_sched_class; |
93 | 107 | ||
94 | /************************************************************** | 108 | /************************************************************** |
@@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) | |||
292 | 306 | ||
293 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 307 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
294 | 308 | ||
309 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
310 | unsigned long delta_exec); | ||
295 | 311 | ||
296 | /************************************************************** | 312 | /************************************************************** |
297 | * Scheduling class tree data structure manipulation methods: | 313 | * Scheduling class tree data structure manipulation methods: |
@@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
583 | cpuacct_charge(curtask, delta_exec); | 599 | cpuacct_charge(curtask, delta_exec); |
584 | account_group_exec_runtime(curtask, delta_exec); | 600 | account_group_exec_runtime(curtask, delta_exec); |
585 | } | 601 | } |
602 | |||
603 | account_cfs_rq_runtime(cfs_rq, delta_exec); | ||
586 | } | 604 | } |
587 | 605 | ||
588 | static inline void | 606 | static inline void |
@@ -688,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
688 | } | 706 | } |
689 | 707 | ||
690 | #ifdef CONFIG_FAIR_GROUP_SCHED | 708 | #ifdef CONFIG_FAIR_GROUP_SCHED |
709 | /* we need this in update_cfs_load and load-balance functions below */ | ||
710 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | ||
691 | # ifdef CONFIG_SMP | 711 | # ifdef CONFIG_SMP |
692 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, | 712 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, |
693 | int global_update) | 713 | int global_update) |
@@ -710,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
710 | u64 now, delta; | 730 | u64 now, delta; |
711 | unsigned long load = cfs_rq->load.weight; | 731 | unsigned long load = cfs_rq->load.weight; |
712 | 732 | ||
713 | if (cfs_rq->tg == &root_task_group) | 733 | if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) |
714 | return; | 734 | return; |
715 | 735 | ||
716 | now = rq_of(cfs_rq)->clock_task; | 736 | now = rq_of(cfs_rq)->clock_task; |
@@ -819,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) | |||
819 | 839 | ||
820 | tg = cfs_rq->tg; | 840 | tg = cfs_rq->tg; |
821 | se = tg->se[cpu_of(rq_of(cfs_rq))]; | 841 | se = tg->se[cpu_of(rq_of(cfs_rq))]; |
822 | if (!se) | 842 | if (!se || throttled_hierarchy(cfs_rq)) |
823 | return; | 843 | return; |
824 | #ifndef CONFIG_SMP | 844 | #ifndef CONFIG_SMP |
825 | if (likely(se->load.weight == tg->shares)) | 845 | if (likely(se->load.weight == tg->shares)) |
@@ -950,6 +970,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
950 | se->vruntime = vruntime; | 970 | se->vruntime = vruntime; |
951 | } | 971 | } |
952 | 972 | ||
973 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq); | ||
974 | |||
953 | static void | 975 | static void |
954 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 976 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
955 | { | 977 | { |
@@ -979,8 +1001,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
979 | __enqueue_entity(cfs_rq, se); | 1001 | __enqueue_entity(cfs_rq, se); |
980 | se->on_rq = 1; | 1002 | se->on_rq = 1; |
981 | 1003 | ||
982 | if (cfs_rq->nr_running == 1) | 1004 | if (cfs_rq->nr_running == 1) { |
983 | list_add_leaf_cfs_rq(cfs_rq); | 1005 | list_add_leaf_cfs_rq(cfs_rq); |
1006 | check_enqueue_throttle(cfs_rq); | ||
1007 | } | ||
984 | } | 1008 | } |
985 | 1009 | ||
986 | static void __clear_buddies_last(struct sched_entity *se) | 1010 | static void __clear_buddies_last(struct sched_entity *se) |
@@ -1028,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1028 | __clear_buddies_skip(se); | 1052 | __clear_buddies_skip(se); |
1029 | } | 1053 | } |
1030 | 1054 | ||
1055 | static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); | ||
1056 | |||
1031 | static void | 1057 | static void |
1032 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 1058 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
1033 | { | 1059 | { |
@@ -1066,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1066 | if (!(flags & DEQUEUE_SLEEP)) | 1092 | if (!(flags & DEQUEUE_SLEEP)) |
1067 | se->vruntime -= cfs_rq->min_vruntime; | 1093 | se->vruntime -= cfs_rq->min_vruntime; |
1068 | 1094 | ||
1095 | /* return excess runtime on last dequeue */ | ||
1096 | return_cfs_rq_runtime(cfs_rq); | ||
1097 | |||
1069 | update_min_vruntime(cfs_rq); | 1098 | update_min_vruntime(cfs_rq); |
1070 | update_cfs_shares(cfs_rq); | 1099 | update_cfs_shares(cfs_rq); |
1071 | } | 1100 | } |
@@ -1077,6 +1106,8 @@ static void | |||
1077 | check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 1106 | check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
1078 | { | 1107 | { |
1079 | unsigned long ideal_runtime, delta_exec; | 1108 | unsigned long ideal_runtime, delta_exec; |
1109 | struct sched_entity *se; | ||
1110 | s64 delta; | ||
1080 | 1111 | ||
1081 | ideal_runtime = sched_slice(cfs_rq, curr); | 1112 | ideal_runtime = sched_slice(cfs_rq, curr); |
1082 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | 1113 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; |
@@ -1095,22 +1126,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
1095 | * narrow margin doesn't have to wait for a full slice. | 1126 | * narrow margin doesn't have to wait for a full slice. |
1096 | * This also mitigates buddy induced latencies under load. | 1127 | * This also mitigates buddy induced latencies under load. |
1097 | */ | 1128 | */ |
1098 | if (!sched_feat(WAKEUP_PREEMPT)) | ||
1099 | return; | ||
1100 | |||
1101 | if (delta_exec < sysctl_sched_min_granularity) | 1129 | if (delta_exec < sysctl_sched_min_granularity) |
1102 | return; | 1130 | return; |
1103 | 1131 | ||
1104 | if (cfs_rq->nr_running > 1) { | 1132 | se = __pick_first_entity(cfs_rq); |
1105 | struct sched_entity *se = __pick_first_entity(cfs_rq); | 1133 | delta = curr->vruntime - se->vruntime; |
1106 | s64 delta = curr->vruntime - se->vruntime; | ||
1107 | 1134 | ||
1108 | if (delta < 0) | 1135 | if (delta < 0) |
1109 | return; | 1136 | return; |
1110 | 1137 | ||
1111 | if (delta > ideal_runtime) | 1138 | if (delta > ideal_runtime) |
1112 | resched_task(rq_of(cfs_rq)->curr); | 1139 | resched_task(rq_of(cfs_rq)->curr); |
1113 | } | ||
1114 | } | 1140 | } |
1115 | 1141 | ||
1116 | static void | 1142 | static void |
@@ -1185,6 +1211,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | |||
1185 | return se; | 1211 | return se; |
1186 | } | 1212 | } |
1187 | 1213 | ||
1214 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); | ||
1215 | |||
1188 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | 1216 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) |
1189 | { | 1217 | { |
1190 | /* | 1218 | /* |
@@ -1194,6 +1222,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
1194 | if (prev->on_rq) | 1222 | if (prev->on_rq) |
1195 | update_curr(cfs_rq); | 1223 | update_curr(cfs_rq); |
1196 | 1224 | ||
1225 | /* throttle cfs_rqs exceeding runtime */ | ||
1226 | check_cfs_rq_runtime(cfs_rq); | ||
1227 | |||
1197 | check_spread(cfs_rq, prev); | 1228 | check_spread(cfs_rq, prev); |
1198 | if (prev->on_rq) { | 1229 | if (prev->on_rq) { |
1199 | update_stats_wait_start(cfs_rq, prev); | 1230 | update_stats_wait_start(cfs_rq, prev); |
@@ -1233,10 +1264,583 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
1233 | return; | 1264 | return; |
1234 | #endif | 1265 | #endif |
1235 | 1266 | ||
1236 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) | 1267 | if (cfs_rq->nr_running > 1) |
1237 | check_preempt_tick(cfs_rq, curr); | 1268 | check_preempt_tick(cfs_rq, curr); |
1238 | } | 1269 | } |
1239 | 1270 | ||
1271 | |||
1272 | /************************************************** | ||
1273 | * CFS bandwidth control machinery | ||
1274 | */ | ||
1275 | |||
1276 | #ifdef CONFIG_CFS_BANDWIDTH | ||
1277 | /* | ||
1278 | * default period for cfs group bandwidth. | ||
1279 | * default: 0.1s, units: nanoseconds | ||
1280 | */ | ||
1281 | static inline u64 default_cfs_period(void) | ||
1282 | { | ||
1283 | return 100000000ULL; | ||
1284 | } | ||
1285 | |||
1286 | static inline u64 sched_cfs_bandwidth_slice(void) | ||
1287 | { | ||
1288 | return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; | ||
1289 | } | ||
1290 | |||
1291 | /* | ||
1292 | * Replenish runtime according to assigned quota and update expiration time. | ||
1293 | * We use sched_clock_cpu directly instead of rq->clock to avoid adding | ||
1294 | * additional synchronization around rq->lock. | ||
1295 | * | ||
1296 | * requires cfs_b->lock | ||
1297 | */ | ||
1298 | static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) | ||
1299 | { | ||
1300 | u64 now; | ||
1301 | |||
1302 | if (cfs_b->quota == RUNTIME_INF) | ||
1303 | return; | ||
1304 | |||
1305 | now = sched_clock_cpu(smp_processor_id()); | ||
1306 | cfs_b->runtime = cfs_b->quota; | ||
1307 | cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); | ||
1308 | } | ||
1309 | |||
1310 | /* returns 0 on failure to allocate runtime */ | ||
1311 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1312 | { | ||
1313 | struct task_group *tg = cfs_rq->tg; | ||
1314 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
1315 | u64 amount = 0, min_amount, expires; | ||
1316 | |||
1317 | /* note: this is a positive sum as runtime_remaining <= 0 */ | ||
1318 | min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; | ||
1319 | |||
1320 | raw_spin_lock(&cfs_b->lock); | ||
1321 | if (cfs_b->quota == RUNTIME_INF) | ||
1322 | amount = min_amount; | ||
1323 | else { | ||
1324 | /* | ||
1325 | * If the bandwidth pool has become inactive, then at least one | ||
1326 | * period must have elapsed since the last consumption. | ||
1327 | * Refresh the global state and ensure bandwidth timer becomes | ||
1328 | * active. | ||
1329 | */ | ||
1330 | if (!cfs_b->timer_active) { | ||
1331 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
1332 | __start_cfs_bandwidth(cfs_b); | ||
1333 | } | ||
1334 | |||
1335 | if (cfs_b->runtime > 0) { | ||
1336 | amount = min(cfs_b->runtime, min_amount); | ||
1337 | cfs_b->runtime -= amount; | ||
1338 | cfs_b->idle = 0; | ||
1339 | } | ||
1340 | } | ||
1341 | expires = cfs_b->runtime_expires; | ||
1342 | raw_spin_unlock(&cfs_b->lock); | ||
1343 | |||
1344 | cfs_rq->runtime_remaining += amount; | ||
1345 | /* | ||
1346 | * we may have advanced our local expiration to account for allowed | ||
1347 | * spread between our sched_clock and the one on which runtime was | ||
1348 | * issued. | ||
1349 | */ | ||
1350 | if ((s64)(expires - cfs_rq->runtime_expires) > 0) | ||
1351 | cfs_rq->runtime_expires = expires; | ||
1352 | |||
1353 | return cfs_rq->runtime_remaining > 0; | ||
1354 | } | ||
1355 | |||
1356 | /* | ||
1357 | * Note: This depends on the synchronization provided by sched_clock and the | ||
1358 | * fact that rq->clock snapshots this value. | ||
1359 | */ | ||
1360 | static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1361 | { | ||
1362 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1363 | struct rq *rq = rq_of(cfs_rq); | ||
1364 | |||
1365 | /* if the deadline is ahead of our clock, nothing to do */ | ||
1366 | if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) | ||
1367 | return; | ||
1368 | |||
1369 | if (cfs_rq->runtime_remaining < 0) | ||
1370 | return; | ||
1371 | |||
1372 | /* | ||
1373 | * If the local deadline has passed we have to consider the | ||
1374 | * possibility that our sched_clock is 'fast' and the global deadline | ||
1375 | * has not truly expired. | ||
1376 | * | ||
1377 | * Fortunately we can determine whether this is the case by checking | ||
1378 | * whether the global deadline has advanced. | ||
1379 | */ | ||
1380 | |||
1381 | if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { | ||
1382 | /* extend local deadline, drift is bounded above by 2 ticks */ | ||
1383 | cfs_rq->runtime_expires += TICK_NSEC; | ||
1384 | } else { | ||
1385 | /* global deadline is ahead, expiration has passed */ | ||
1386 | cfs_rq->runtime_remaining = 0; | ||
1387 | } | ||
1388 | } | ||
1389 | |||
1390 | static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
1391 | unsigned long delta_exec) | ||
1392 | { | ||
1393 | /* dock delta_exec before expiring quota (as it could span periods) */ | ||
1394 | cfs_rq->runtime_remaining -= delta_exec; | ||
1395 | expire_cfs_rq_runtime(cfs_rq); | ||
1396 | |||
1397 | if (likely(cfs_rq->runtime_remaining > 0)) | ||
1398 | return; | ||
1399 | |||
1400 | /* | ||
1401 | * if we're unable to extend our runtime we resched so that the active | ||
1402 | * hierarchy can be throttled | ||
1403 | */ | ||
1404 | if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) | ||
1405 | resched_task(rq_of(cfs_rq)->curr); | ||
1406 | } | ||
1407 | |||
1408 | static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
1409 | unsigned long delta_exec) | ||
1410 | { | ||
1411 | if (!cfs_rq->runtime_enabled) | ||
1412 | return; | ||
1413 | |||
1414 | __account_cfs_rq_runtime(cfs_rq, delta_exec); | ||
1415 | } | ||
1416 | |||
1417 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | ||
1418 | { | ||
1419 | return cfs_rq->throttled; | ||
1420 | } | ||
1421 | |||
1422 | /* check whether cfs_rq, or any parent, is throttled */ | ||
1423 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) | ||
1424 | { | ||
1425 | return cfs_rq->throttle_count; | ||
1426 | } | ||
1427 | |||
1428 | /* | ||
1429 | * Ensure that neither of the group entities corresponding to src_cpu or | ||
1430 | * dest_cpu are members of a throttled hierarchy when performing group | ||
1431 | * load-balance operations. | ||
1432 | */ | ||
1433 | static inline int throttled_lb_pair(struct task_group *tg, | ||
1434 | int src_cpu, int dest_cpu) | ||
1435 | { | ||
1436 | struct cfs_rq *src_cfs_rq, *dest_cfs_rq; | ||
1437 | |||
1438 | src_cfs_rq = tg->cfs_rq[src_cpu]; | ||
1439 | dest_cfs_rq = tg->cfs_rq[dest_cpu]; | ||
1440 | |||
1441 | return throttled_hierarchy(src_cfs_rq) || | ||
1442 | throttled_hierarchy(dest_cfs_rq); | ||
1443 | } | ||
1444 | |||
1445 | /* updated child weight may affect parent so we have to do this bottom up */ | ||
1446 | static int tg_unthrottle_up(struct task_group *tg, void *data) | ||
1447 | { | ||
1448 | struct rq *rq = data; | ||
1449 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | ||
1450 | |||
1451 | cfs_rq->throttle_count--; | ||
1452 | #ifdef CONFIG_SMP | ||
1453 | if (!cfs_rq->throttle_count) { | ||
1454 | u64 delta = rq->clock_task - cfs_rq->load_stamp; | ||
1455 | |||
1456 | /* leaving throttled state, advance shares averaging windows */ | ||
1457 | cfs_rq->load_stamp += delta; | ||
1458 | cfs_rq->load_last += delta; | ||
1459 | |||
1460 | /* update entity weight now that we are on_rq again */ | ||
1461 | update_cfs_shares(cfs_rq); | ||
1462 | } | ||
1463 | #endif | ||
1464 | |||
1465 | return 0; | ||
1466 | } | ||
1467 | |||
1468 | static int tg_throttle_down(struct task_group *tg, void *data) | ||
1469 | { | ||
1470 | struct rq *rq = data; | ||
1471 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | ||
1472 | |||
1473 | /* group is entering throttled state, record last load */ | ||
1474 | if (!cfs_rq->throttle_count) | ||
1475 | update_cfs_load(cfs_rq, 0); | ||
1476 | cfs_rq->throttle_count++; | ||
1477 | |||
1478 | return 0; | ||
1479 | } | ||
1480 | |||
1481 | static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | ||
1482 | { | ||
1483 | struct rq *rq = rq_of(cfs_rq); | ||
1484 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1485 | struct sched_entity *se; | ||
1486 | long task_delta, dequeue = 1; | ||
1487 | |||
1488 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | ||
1489 | |||
1490 | /* account load preceding throttle */ | ||
1491 | rcu_read_lock(); | ||
1492 | walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); | ||
1493 | rcu_read_unlock(); | ||
1494 | |||
1495 | task_delta = cfs_rq->h_nr_running; | ||
1496 | for_each_sched_entity(se) { | ||
1497 | struct cfs_rq *qcfs_rq = cfs_rq_of(se); | ||
1498 | /* throttled entity or throttle-on-deactivate */ | ||
1499 | if (!se->on_rq) | ||
1500 | break; | ||
1501 | |||
1502 | if (dequeue) | ||
1503 | dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); | ||
1504 | qcfs_rq->h_nr_running -= task_delta; | ||
1505 | |||
1506 | if (qcfs_rq->load.weight) | ||
1507 | dequeue = 0; | ||
1508 | } | ||
1509 | |||
1510 | if (!se) | ||
1511 | rq->nr_running -= task_delta; | ||
1512 | |||
1513 | cfs_rq->throttled = 1; | ||
1514 | cfs_rq->throttled_timestamp = rq->clock; | ||
1515 | raw_spin_lock(&cfs_b->lock); | ||
1516 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | ||
1517 | raw_spin_unlock(&cfs_b->lock); | ||
1518 | } | ||
1519 | |||
1520 | static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | ||
1521 | { | ||
1522 | struct rq *rq = rq_of(cfs_rq); | ||
1523 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1524 | struct sched_entity *se; | ||
1525 | int enqueue = 1; | ||
1526 | long task_delta; | ||
1527 | |||
1528 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | ||
1529 | |||
1530 | cfs_rq->throttled = 0; | ||
1531 | raw_spin_lock(&cfs_b->lock); | ||
1532 | cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; | ||
1533 | list_del_rcu(&cfs_rq->throttled_list); | ||
1534 | raw_spin_unlock(&cfs_b->lock); | ||
1535 | cfs_rq->throttled_timestamp = 0; | ||
1536 | |||
1537 | update_rq_clock(rq); | ||
1538 | /* update hierarchical throttle state */ | ||
1539 | walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); | ||
1540 | |||
1541 | if (!cfs_rq->load.weight) | ||
1542 | return; | ||
1543 | |||
1544 | task_delta = cfs_rq->h_nr_running; | ||
1545 | for_each_sched_entity(se) { | ||
1546 | if (se->on_rq) | ||
1547 | enqueue = 0; | ||
1548 | |||
1549 | cfs_rq = cfs_rq_of(se); | ||
1550 | if (enqueue) | ||
1551 | enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); | ||
1552 | cfs_rq->h_nr_running += task_delta; | ||
1553 | |||
1554 | if (cfs_rq_throttled(cfs_rq)) | ||
1555 | break; | ||
1556 | } | ||
1557 | |||
1558 | if (!se) | ||
1559 | rq->nr_running += task_delta; | ||
1560 | |||
1561 | /* determine whether we need to wake up potentially idle cpu */ | ||
1562 | if (rq->curr == rq->idle && rq->cfs.nr_running) | ||
1563 | resched_task(rq->curr); | ||
1564 | } | ||
1565 | |||
1566 | static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, | ||
1567 | u64 remaining, u64 expires) | ||
1568 | { | ||
1569 | struct cfs_rq *cfs_rq; | ||
1570 | u64 runtime = remaining; | ||
1571 | |||
1572 | rcu_read_lock(); | ||
1573 | list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, | ||
1574 | throttled_list) { | ||
1575 | struct rq *rq = rq_of(cfs_rq); | ||
1576 | |||
1577 | raw_spin_lock(&rq->lock); | ||
1578 | if (!cfs_rq_throttled(cfs_rq)) | ||
1579 | goto next; | ||
1580 | |||
1581 | runtime = -cfs_rq->runtime_remaining + 1; | ||
1582 | if (runtime > remaining) | ||
1583 | runtime = remaining; | ||
1584 | remaining -= runtime; | ||
1585 | |||
1586 | cfs_rq->runtime_remaining += runtime; | ||
1587 | cfs_rq->runtime_expires = expires; | ||
1588 | |||
1589 | /* we check whether we're throttled above */ | ||
1590 | if (cfs_rq->runtime_remaining > 0) | ||
1591 | unthrottle_cfs_rq(cfs_rq); | ||
1592 | |||
1593 | next: | ||
1594 | raw_spin_unlock(&rq->lock); | ||
1595 | |||
1596 | if (!remaining) | ||
1597 | break; | ||
1598 | } | ||
1599 | rcu_read_unlock(); | ||
1600 | |||
1601 | return remaining; | ||
1602 | } | ||
1603 | |||
1604 | /* | ||
1605 | * Responsible for refilling a task_group's bandwidth and unthrottling its | ||
1606 | * cfs_rqs as appropriate. If there has been no activity within the last | ||
1607 | * period the timer is deactivated until scheduling resumes; cfs_b->idle is | ||
1608 | * used to track this state. | ||
1609 | */ | ||
1610 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | ||
1611 | { | ||
1612 | u64 runtime, runtime_expires; | ||
1613 | int idle = 1, throttled; | ||
1614 | |||
1615 | raw_spin_lock(&cfs_b->lock); | ||
1616 | /* no need to continue the timer with no bandwidth constraint */ | ||
1617 | if (cfs_b->quota == RUNTIME_INF) | ||
1618 | goto out_unlock; | ||
1619 | |||
1620 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | ||
1621 | /* idle depends on !throttled (for the case of a large deficit) */ | ||
1622 | idle = cfs_b->idle && !throttled; | ||
1623 | cfs_b->nr_periods += overrun; | ||
1624 | |||
1625 | /* if we're going inactive then everything else can be deferred */ | ||
1626 | if (idle) | ||
1627 | goto out_unlock; | ||
1628 | |||
1629 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
1630 | |||
1631 | if (!throttled) { | ||
1632 | /* mark as potentially idle for the upcoming period */ | ||
1633 | cfs_b->idle = 1; | ||
1634 | goto out_unlock; | ||
1635 | } | ||
1636 | |||
1637 | /* account preceding periods in which throttling occurred */ | ||
1638 | cfs_b->nr_throttled += overrun; | ||
1639 | |||
1640 | /* | ||
1641 | * There are throttled entities so we must first use the new bandwidth | ||
1642 | * to unthrottle them before making it generally available. This | ||
1643 | * ensures that all existing debts will be paid before a new cfs_rq is | ||
1644 | * allowed to run. | ||
1645 | */ | ||
1646 | runtime = cfs_b->runtime; | ||
1647 | runtime_expires = cfs_b->runtime_expires; | ||
1648 | cfs_b->runtime = 0; | ||
1649 | |||
1650 | /* | ||
1651 | * This check is repeated as we are holding onto the new bandwidth | ||
1652 | * while we unthrottle. This can potentially race with an unthrottled | ||
1653 | * group trying to acquire new bandwidth from the global pool. | ||
1654 | */ | ||
1655 | while (throttled && runtime > 0) { | ||
1656 | raw_spin_unlock(&cfs_b->lock); | ||
1657 | /* we can't nest cfs_b->lock while distributing bandwidth */ | ||
1658 | runtime = distribute_cfs_runtime(cfs_b, runtime, | ||
1659 | runtime_expires); | ||
1660 | raw_spin_lock(&cfs_b->lock); | ||
1661 | |||
1662 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | ||
1663 | } | ||
1664 | |||
1665 | /* return (any) remaining runtime */ | ||
1666 | cfs_b->runtime = runtime; | ||
1667 | /* | ||
1668 | * While we are ensured activity in the period following an | ||
1669 | * unthrottle, this also covers the case in which the new bandwidth is | ||
1670 | * insufficient to cover the existing bandwidth deficit. (Forcing the | ||
1671 | * timer to remain active while there are any throttled entities.) | ||
1672 | */ | ||
1673 | cfs_b->idle = 0; | ||
1674 | out_unlock: | ||
1675 | if (idle) | ||
1676 | cfs_b->timer_active = 0; | ||
1677 | raw_spin_unlock(&cfs_b->lock); | ||
1678 | |||
1679 | return idle; | ||
1680 | } | ||
1681 | |||
1682 | /* a cfs_rq won't donate quota below this amount */ | ||
1683 | static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; | ||
1684 | /* minimum remaining period time to redistribute slack quota */ | ||
1685 | static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; | ||
1686 | /* how long we wait to gather additional slack before distributing */ | ||
1687 | static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; | ||
1688 | |||
1689 | /* are we near the end of the current quota period? */ | ||
1690 | static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) | ||
1691 | { | ||
1692 | struct hrtimer *refresh_timer = &cfs_b->period_timer; | ||
1693 | u64 remaining; | ||
1694 | |||
1695 | /* if the call-back is running a quota refresh is already occurring */ | ||
1696 | if (hrtimer_callback_running(refresh_timer)) | ||
1697 | return 1; | ||
1698 | |||
1699 | /* is a quota refresh about to occur? */ | ||
1700 | remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); | ||
1701 | if (remaining < min_expire) | ||
1702 | return 1; | ||
1703 | |||
1704 | return 0; | ||
1705 | } | ||
1706 | |||
1707 | static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) | ||
1708 | { | ||
1709 | u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration; | ||
1710 | |||
1711 | /* if there's a quota refresh soon don't bother with slack */ | ||
1712 | if (runtime_refresh_within(cfs_b, min_left)) | ||
1713 | return; | ||
1714 | |||
1715 | start_bandwidth_timer(&cfs_b->slack_timer, | ||
1716 | ns_to_ktime(cfs_bandwidth_slack_period)); | ||
1717 | } | ||
1718 | |||
1719 | /* we know any runtime found here is valid as update_curr() precedes return */ | ||
1720 | static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1721 | { | ||
1722 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1723 | s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime; | ||
1724 | |||
1725 | if (slack_runtime <= 0) | ||
1726 | return; | ||
1727 | |||
1728 | raw_spin_lock(&cfs_b->lock); | ||
1729 | if (cfs_b->quota != RUNTIME_INF && | ||
1730 | cfs_rq->runtime_expires == cfs_b->runtime_expires) { | ||
1731 | cfs_b->runtime += slack_runtime; | ||
1732 | |||
1733 | /* we are under rq->lock, defer unthrottling using a timer */ | ||
1734 | if (cfs_b->runtime > sched_cfs_bandwidth_slice() && | ||
1735 | !list_empty(&cfs_b->throttled_cfs_rq)) | ||
1736 | start_cfs_slack_bandwidth(cfs_b); | ||
1737 | } | ||
1738 | raw_spin_unlock(&cfs_b->lock); | ||
1739 | |||
1740 | /* even if it's not valid for return we don't want to try again */ | ||
1741 | cfs_rq->runtime_remaining -= slack_runtime; | ||
1742 | } | ||
1743 | |||
1744 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1745 | { | ||
1746 | if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) | ||
1747 | return; | ||
1748 | |||
1749 | __return_cfs_rq_runtime(cfs_rq); | ||
1750 | } | ||
1751 | |||
1752 | /* | ||
1753 | * This is done with a timer (instead of inline with bandwidth return) since | ||
1754 | * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs. | ||
1755 | */ | ||
1756 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | ||
1757 | { | ||
1758 | u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); | ||
1759 | u64 expires; | ||
1760 | |||
1761 | /* confirm we're still not at a refresh boundary */ | ||
1762 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) | ||
1763 | return; | ||
1764 | |||
1765 | raw_spin_lock(&cfs_b->lock); | ||
1766 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { | ||
1767 | runtime = cfs_b->runtime; | ||
1768 | cfs_b->runtime = 0; | ||
1769 | } | ||
1770 | expires = cfs_b->runtime_expires; | ||
1771 | raw_spin_unlock(&cfs_b->lock); | ||
1772 | |||
1773 | if (!runtime) | ||
1774 | return; | ||
1775 | |||
1776 | runtime = distribute_cfs_runtime(cfs_b, runtime, expires); | ||
1777 | |||
1778 | raw_spin_lock(&cfs_b->lock); | ||
1779 | if (expires == cfs_b->runtime_expires) | ||
1780 | cfs_b->runtime = runtime; | ||
1781 | raw_spin_unlock(&cfs_b->lock); | ||
1782 | } | ||
1783 | |||
1784 | /* | ||
1785 | * When a group wakes up we want to make sure that its quota is not already | ||
1786 | * expired/exceeded, otherwise it may be allowed to steal additional ticks of | ||
1787 | * runtime as update_curr() throttling can not trigger until it's on-rq. | ||
1788 | */ | ||
1789 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | ||
1790 | { | ||
1791 | /* an active group must be handled by the update_curr()->put() path */ | ||
1792 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) | ||
1793 | return; | ||
1794 | |||
1795 | /* ensure the group is not already throttled */ | ||
1796 | if (cfs_rq_throttled(cfs_rq)) | ||
1797 | return; | ||
1798 | |||
1799 | /* update runtime allocation */ | ||
1800 | account_cfs_rq_runtime(cfs_rq, 0); | ||
1801 | if (cfs_rq->runtime_remaining <= 0) | ||
1802 | throttle_cfs_rq(cfs_rq); | ||
1803 | } | ||
1804 | |||
1805 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ | ||
1806 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1807 | { | ||
1808 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) | ||
1809 | return; | ||
1810 | |||
1811 | /* | ||
1812 | * it's possible for a throttled entity to be forced into a running | ||
1813 | * state (e.g. set_curr_task), in this case we're finished. | ||
1814 | */ | ||
1815 | if (cfs_rq_throttled(cfs_rq)) | ||
1816 | return; | ||
1817 | |||
1818 | throttle_cfs_rq(cfs_rq); | ||
1819 | } | ||
1820 | #else | ||
1821 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
1822 | unsigned long delta_exec) {} | ||
1823 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
1824 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | ||
1825 | static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
1826 | |||
1827 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | ||
1828 | { | ||
1829 | return 0; | ||
1830 | } | ||
1831 | |||
1832 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) | ||
1833 | { | ||
1834 | return 0; | ||
1835 | } | ||
1836 | |||
1837 | static inline int throttled_lb_pair(struct task_group *tg, | ||
1838 | int src_cpu, int dest_cpu) | ||
1839 | { | ||
1840 | return 0; | ||
1841 | } | ||
1842 | #endif | ||
1843 | |||
1240 | /************************************************** | 1844 | /************************************************** |
1241 | * CFS operations on tasks: | 1845 | * CFS operations on tasks: |
1242 | */ | 1846 | */ |
@@ -1313,16 +1917,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1313 | break; | 1917 | break; |
1314 | cfs_rq = cfs_rq_of(se); | 1918 | cfs_rq = cfs_rq_of(se); |
1315 | enqueue_entity(cfs_rq, se, flags); | 1919 | enqueue_entity(cfs_rq, se, flags); |
1920 | |||
1921 | /* | ||
1922 | * end evaluation on encountering a throttled cfs_rq | ||
1923 | * | ||
1924 | * note: in the case of encountering a throttled cfs_rq we will | ||
1925 | * post the final h_nr_running increment below. | ||
1926 | */ | ||
1927 | if (cfs_rq_throttled(cfs_rq)) | ||
1928 | break; | ||
1929 | cfs_rq->h_nr_running++; | ||
1930 | |||
1316 | flags = ENQUEUE_WAKEUP; | 1931 | flags = ENQUEUE_WAKEUP; |
1317 | } | 1932 | } |
1318 | 1933 | ||
1319 | for_each_sched_entity(se) { | 1934 | for_each_sched_entity(se) { |
1320 | cfs_rq = cfs_rq_of(se); | 1935 | cfs_rq = cfs_rq_of(se); |
1936 | cfs_rq->h_nr_running++; | ||
1937 | |||
1938 | if (cfs_rq_throttled(cfs_rq)) | ||
1939 | break; | ||
1321 | 1940 | ||
1322 | update_cfs_load(cfs_rq, 0); | 1941 | update_cfs_load(cfs_rq, 0); |
1323 | update_cfs_shares(cfs_rq); | 1942 | update_cfs_shares(cfs_rq); |
1324 | } | 1943 | } |
1325 | 1944 | ||
1945 | if (!se) | ||
1946 | inc_nr_running(rq); | ||
1326 | hrtick_update(rq); | 1947 | hrtick_update(rq); |
1327 | } | 1948 | } |
1328 | 1949 | ||
@@ -1343,6 +1964,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1343 | cfs_rq = cfs_rq_of(se); | 1964 | cfs_rq = cfs_rq_of(se); |
1344 | dequeue_entity(cfs_rq, se, flags); | 1965 | dequeue_entity(cfs_rq, se, flags); |
1345 | 1966 | ||
1967 | /* | ||
1968 | * end evaluation on encountering a throttled cfs_rq | ||
1969 | * | ||
1970 | * note: in the case of encountering a throttled cfs_rq we will | ||
1971 | * post the final h_nr_running decrement below. | ||
1972 | */ | ||
1973 | if (cfs_rq_throttled(cfs_rq)) | ||
1974 | break; | ||
1975 | cfs_rq->h_nr_running--; | ||
1976 | |||
1346 | /* Don't dequeue parent if it has other entities besides us */ | 1977 | /* Don't dequeue parent if it has other entities besides us */ |
1347 | if (cfs_rq->load.weight) { | 1978 | if (cfs_rq->load.weight) { |
1348 | /* | 1979 | /* |
@@ -1361,11 +1992,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1361 | 1992 | ||
1362 | for_each_sched_entity(se) { | 1993 | for_each_sched_entity(se) { |
1363 | cfs_rq = cfs_rq_of(se); | 1994 | cfs_rq = cfs_rq_of(se); |
1995 | cfs_rq->h_nr_running--; | ||
1996 | |||
1997 | if (cfs_rq_throttled(cfs_rq)) | ||
1998 | break; | ||
1364 | 1999 | ||
1365 | update_cfs_load(cfs_rq, 0); | 2000 | update_cfs_load(cfs_rq, 0); |
1366 | update_cfs_shares(cfs_rq); | 2001 | update_cfs_shares(cfs_rq); |
1367 | } | 2002 | } |
1368 | 2003 | ||
2004 | if (!se) | ||
2005 | dec_nr_running(rq); | ||
1369 | hrtick_update(rq); | 2006 | hrtick_update(rq); |
1370 | } | 2007 | } |
1371 | 2008 | ||
@@ -1434,7 +2071,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
1434 | 2071 | ||
1435 | return wl; | 2072 | return wl; |
1436 | } | 2073 | } |
1437 | |||
1438 | #else | 2074 | #else |
1439 | 2075 | ||
1440 | static inline unsigned long effective_load(struct task_group *tg, int cpu, | 2076 | static inline unsigned long effective_load(struct task_group *tg, int cpu, |
@@ -1875,6 +2511,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1875 | if (unlikely(se == pse)) | 2511 | if (unlikely(se == pse)) |
1876 | return; | 2512 | return; |
1877 | 2513 | ||
2514 | /* | ||
2515 | * This is possible from callers such as pull_task(), in which we | ||
2516 | * unconditionally check_preempt_curr() after an enqueue (which may have | ||
2517 | * led to a throttle). This both saves work and prevents false | ||
2518 | * next-buddy nomination below. | ||
2519 | */ | ||
2520 | if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) | ||
2521 | return; | ||
2522 | |||
1878 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { | 2523 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { |
1879 | set_next_buddy(pse); | 2524 | set_next_buddy(pse); |
1880 | next_buddy_marked = 1; | 2525 | next_buddy_marked = 1; |
@@ -1883,6 +2528,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1883 | /* | 2528 | /* |
1884 | * We can come here with TIF_NEED_RESCHED already set from new task | 2529 | * We can come here with TIF_NEED_RESCHED already set from new task |
1885 | * wake up path. | 2530 | * wake up path. |
2531 | * | ||
2532 | * Note: this also catches the edge-case of curr being in a throttled | ||
2533 | * group (e.g. via set_curr_task), since update_curr() (in the | ||
2534 | * enqueue of curr) will have resulted in resched being set. This | ||
2535 | * prevents us from potentially nominating it as a false LAST_BUDDY | ||
2536 | * below. | ||
1886 | */ | 2537 | */ |
1887 | if (test_tsk_need_resched(curr)) | 2538 | if (test_tsk_need_resched(curr)) |
1888 | return; | 2539 | return; |
@@ -1899,10 +2550,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1899 | if (unlikely(p->policy != SCHED_NORMAL)) | 2550 | if (unlikely(p->policy != SCHED_NORMAL)) |
1900 | return; | 2551 | return; |
1901 | 2552 | ||
1902 | |||
1903 | if (!sched_feat(WAKEUP_PREEMPT)) | ||
1904 | return; | ||
1905 | |||
1906 | find_matching_se(&se, &pse); | 2553 | find_matching_se(&se, &pse); |
1907 | update_curr(cfs_rq_of(se)); | 2554 | update_curr(cfs_rq_of(se)); |
1908 | BUG_ON(!pse); | 2555 | BUG_ON(!pse); |
@@ -2005,7 +2652,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
2005 | { | 2652 | { |
2006 | struct sched_entity *se = &p->se; | 2653 | struct sched_entity *se = &p->se; |
2007 | 2654 | ||
2008 | if (!se->on_rq) | 2655 | /* throttled hierarchies are not runnable */ |
2656 | if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) | ||
2009 | return false; | 2657 | return false; |
2010 | 2658 | ||
2011 | /* Tell the scheduler that we'd really like pse to run next. */ | 2659 | /* Tell the scheduler that we'd really like pse to run next. */ |
@@ -2102,6 +2750,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2102 | 2750 | ||
2103 | for_each_leaf_cfs_rq(busiest, cfs_rq) { | 2751 | for_each_leaf_cfs_rq(busiest, cfs_rq) { |
2104 | list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { | 2752 | list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { |
2753 | if (throttled_lb_pair(task_group(p), | ||
2754 | busiest->cpu, this_cpu)) | ||
2755 | break; | ||
2105 | 2756 | ||
2106 | if (!can_migrate_task(p, busiest, this_cpu, | 2757 | if (!can_migrate_task(p, busiest, this_cpu, |
2107 | sd, idle, &pinned)) | 2758 | sd, idle, &pinned)) |
@@ -2217,8 +2868,13 @@ static void update_shares(int cpu) | |||
2217 | * Iterates the task_group tree in a bottom up fashion, see | 2868 | * Iterates the task_group tree in a bottom up fashion, see |
2218 | * list_add_leaf_cfs_rq() for details. | 2869 | * list_add_leaf_cfs_rq() for details. |
2219 | */ | 2870 | */ |
2220 | for_each_leaf_cfs_rq(rq, cfs_rq) | 2871 | for_each_leaf_cfs_rq(rq, cfs_rq) { |
2872 | /* throttled entities do not contribute to load */ | ||
2873 | if (throttled_hierarchy(cfs_rq)) | ||
2874 | continue; | ||
2875 | |||
2221 | update_shares_cpu(cfs_rq->tg, cpu); | 2876 | update_shares_cpu(cfs_rq->tg, cpu); |
2877 | } | ||
2222 | rcu_read_unlock(); | 2878 | rcu_read_unlock(); |
2223 | } | 2879 | } |
2224 | 2880 | ||
@@ -2268,9 +2924,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2268 | u64 rem_load, moved_load; | 2924 | u64 rem_load, moved_load; |
2269 | 2925 | ||
2270 | /* | 2926 | /* |
2271 | * empty group | 2927 | * empty group or part of a throttled hierarchy |
2272 | */ | 2928 | */ |
2273 | if (!busiest_cfs_rq->task_weight) | 2929 | if (!busiest_cfs_rq->task_weight || |
2930 | throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) | ||
2274 | continue; | 2931 | continue; |
2275 | 2932 | ||
2276 | rem_load = (u64)rem_load_move * busiest_weight; | 2933 | rem_load = (u64)rem_load_move * busiest_weight; |
@@ -3667,7 +4324,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
3667 | struct sched_domain *sd; | 4324 | struct sched_domain *sd; |
3668 | 4325 | ||
3669 | for_each_domain(cpu, sd) | 4326 | for_each_domain(cpu, sd) |
3670 | if (sd && (sd->flags & flag)) | 4327 | if (sd->flags & flag) |
3671 | break; | 4328 | break; |
3672 | 4329 | ||
3673 | return sd; | 4330 | return sd; |
@@ -4251,8 +4908,13 @@ static void set_curr_task_fair(struct rq *rq) | |||
4251 | { | 4908 | { |
4252 | struct sched_entity *se = &rq->curr->se; | 4909 | struct sched_entity *se = &rq->curr->se; |
4253 | 4910 | ||
4254 | for_each_sched_entity(se) | 4911 | for_each_sched_entity(se) { |
4255 | set_next_entity(cfs_rq_of(se), se); | 4912 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
4913 | |||
4914 | set_next_entity(cfs_rq, se); | ||
4915 | /* ensure bandwidth has been allocated on our new cfs_rq */ | ||
4916 | account_cfs_rq_runtime(cfs_rq, 0); | ||
4917 | } | ||
4256 | } | 4918 | } |
4257 | 4919 | ||
4258 | #ifdef CONFIG_FAIR_GROUP_SCHED | 4920 | #ifdef CONFIG_FAIR_GROUP_SCHED |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 2e74677cb040..efa0a7b75dde 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -12,11 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) | |||
12 | SCHED_FEAT(START_DEBIT, 1) | 12 | SCHED_FEAT(START_DEBIT, 1) |
13 | 13 | ||
14 | /* | 14 | /* |
15 | * Should wakeups try to preempt running tasks. | ||
16 | */ | ||
17 | SCHED_FEAT(WAKEUP_PREEMPT, 1) | ||
18 | |||
19 | /* | ||
20 | * Based on load and program behaviour, see if it makes sense to place | 15 | * Based on load and program behaviour, see if it makes sense to place |
21 | * a newly woken task on the same cpu as the task that woke it -- | 16 | * a newly woken task on the same cpu as the task that woke it -- |
22 | * improve cache locality. Typically used with SYNC wakeups as | 17 | * improve cache locality. Typically used with SYNC wakeups as |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index af1177858be3..0cc188cf7664 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -124,21 +124,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
124 | update_rt_migration(rt_rq); | 124 | update_rt_migration(rt_rq); |
125 | } | 125 | } |
126 | 126 | ||
127 | static inline int has_pushable_tasks(struct rq *rq) | ||
128 | { | ||
129 | return !plist_head_empty(&rq->rt.pushable_tasks); | ||
130 | } | ||
131 | |||
127 | static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) | 132 | static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) |
128 | { | 133 | { |
129 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); | 134 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); |
130 | plist_node_init(&p->pushable_tasks, p->prio); | 135 | plist_node_init(&p->pushable_tasks, p->prio); |
131 | plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); | 136 | plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); |
137 | |||
138 | /* Update the highest prio pushable task */ | ||
139 | if (p->prio < rq->rt.highest_prio.next) | ||
140 | rq->rt.highest_prio.next = p->prio; | ||
132 | } | 141 | } |
133 | 142 | ||
134 | static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) | 143 | static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) |
135 | { | 144 | { |
136 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); | 145 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); |
137 | } | ||
138 | 146 | ||
139 | static inline int has_pushable_tasks(struct rq *rq) | 147 | /* Update the new highest prio pushable task */ |
140 | { | 148 | if (has_pushable_tasks(rq)) { |
141 | return !plist_head_empty(&rq->rt.pushable_tasks); | 149 | p = plist_first_entry(&rq->rt.pushable_tasks, |
150 | struct task_struct, pushable_tasks); | ||
151 | rq->rt.highest_prio.next = p->prio; | ||
152 | } else | ||
153 | rq->rt.highest_prio.next = MAX_RT_PRIO; | ||
142 | } | 154 | } |
143 | 155 | ||
144 | #else | 156 | #else |
@@ -698,47 +710,13 @@ static void update_curr_rt(struct rq *rq) | |||
698 | 710 | ||
699 | #if defined CONFIG_SMP | 711 | #if defined CONFIG_SMP |
700 | 712 | ||
701 | static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu); | ||
702 | |||
703 | static inline int next_prio(struct rq *rq) | ||
704 | { | ||
705 | struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu); | ||
706 | |||
707 | if (next && rt_prio(next->prio)) | ||
708 | return next->prio; | ||
709 | else | ||
710 | return MAX_RT_PRIO; | ||
711 | } | ||
712 | |||
713 | static void | 713 | static void |
714 | inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) | 714 | inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) |
715 | { | 715 | { |
716 | struct rq *rq = rq_of_rt_rq(rt_rq); | 716 | struct rq *rq = rq_of_rt_rq(rt_rq); |
717 | 717 | ||
718 | if (prio < prev_prio) { | 718 | if (rq->online && prio < prev_prio) |
719 | 719 | cpupri_set(&rq->rd->cpupri, rq->cpu, prio); | |
720 | /* | ||
721 | * If the new task is higher in priority than anything on the | ||
722 | * run-queue, we know that the previous high becomes our | ||
723 | * next-highest. | ||
724 | */ | ||
725 | rt_rq->highest_prio.next = prev_prio; | ||
726 | |||
727 | if (rq->online) | ||
728 | cpupri_set(&rq->rd->cpupri, rq->cpu, prio); | ||
729 | |||
730 | } else if (prio == rt_rq->highest_prio.curr) | ||
731 | /* | ||
732 | * If the next task is equal in priority to the highest on | ||
733 | * the run-queue, then we implicitly know that the next highest | ||
734 | * task cannot be any lower than current | ||
735 | */ | ||
736 | rt_rq->highest_prio.next = prio; | ||
737 | else if (prio < rt_rq->highest_prio.next) | ||
738 | /* | ||
739 | * Otherwise, we need to recompute next-highest | ||
740 | */ | ||
741 | rt_rq->highest_prio.next = next_prio(rq); | ||
742 | } | 720 | } |
743 | 721 | ||
744 | static void | 722 | static void |
@@ -746,9 +724,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) | |||
746 | { | 724 | { |
747 | struct rq *rq = rq_of_rt_rq(rt_rq); | 725 | struct rq *rq = rq_of_rt_rq(rt_rq); |
748 | 726 | ||
749 | if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next)) | ||
750 | rt_rq->highest_prio.next = next_prio(rq); | ||
751 | |||
752 | if (rq->online && rt_rq->highest_prio.curr != prev_prio) | 727 | if (rq->online && rt_rq->highest_prio.curr != prev_prio) |
753 | cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); | 728 | cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); |
754 | } | 729 | } |
@@ -961,6 +936,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
961 | 936 | ||
962 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 937 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) |
963 | enqueue_pushable_task(rq, p); | 938 | enqueue_pushable_task(rq, p); |
939 | |||
940 | inc_nr_running(rq); | ||
964 | } | 941 | } |
965 | 942 | ||
966 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | 943 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) |
@@ -971,6 +948,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
971 | dequeue_rt_entity(rt_se); | 948 | dequeue_rt_entity(rt_se); |
972 | 949 | ||
973 | dequeue_pushable_task(rq, p); | 950 | dequeue_pushable_task(rq, p); |
951 | |||
952 | dec_nr_running(rq); | ||
974 | } | 953 | } |
975 | 954 | ||
976 | /* | 955 | /* |
@@ -1017,10 +996,12 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1017 | struct rq *rq; | 996 | struct rq *rq; |
1018 | int cpu; | 997 | int cpu; |
1019 | 998 | ||
1020 | if (sd_flag != SD_BALANCE_WAKE) | ||
1021 | return smp_processor_id(); | ||
1022 | |||
1023 | cpu = task_cpu(p); | 999 | cpu = task_cpu(p); |
1000 | |||
1001 | /* For anything but wake ups, just return the task_cpu */ | ||
1002 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | ||
1003 | goto out; | ||
1004 | |||
1024 | rq = cpu_rq(cpu); | 1005 | rq = cpu_rq(cpu); |
1025 | 1006 | ||
1026 | rcu_read_lock(); | 1007 | rcu_read_lock(); |
@@ -1059,6 +1040,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1059 | } | 1040 | } |
1060 | rcu_read_unlock(); | 1041 | rcu_read_unlock(); |
1061 | 1042 | ||
1043 | out: | ||
1062 | return cpu; | 1044 | return cpu; |
1063 | } | 1045 | } |
1064 | 1046 | ||
@@ -1178,7 +1160,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) | |||
1178 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | 1160 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) |
1179 | { | 1161 | { |
1180 | update_curr_rt(rq); | 1162 | update_curr_rt(rq); |
1181 | p->se.exec_start = 0; | ||
1182 | 1163 | ||
1183 | /* | 1164 | /* |
1184 | * The previous task needs to be made eligible for pushing | 1165 | * The previous task needs to be made eligible for pushing |
@@ -1394,6 +1375,7 @@ static int push_rt_task(struct rq *rq) | |||
1394 | { | 1375 | { |
1395 | struct task_struct *next_task; | 1376 | struct task_struct *next_task; |
1396 | struct rq *lowest_rq; | 1377 | struct rq *lowest_rq; |
1378 | int ret = 0; | ||
1397 | 1379 | ||
1398 | if (!rq->rt.overloaded) | 1380 | if (!rq->rt.overloaded) |
1399 | return 0; | 1381 | return 0; |
@@ -1426,7 +1408,7 @@ retry: | |||
1426 | if (!lowest_rq) { | 1408 | if (!lowest_rq) { |
1427 | struct task_struct *task; | 1409 | struct task_struct *task; |
1428 | /* | 1410 | /* |
1429 | * find lock_lowest_rq releases rq->lock | 1411 | * find_lock_lowest_rq releases rq->lock |
1430 | * so it is possible that next_task has migrated. | 1412 | * so it is possible that next_task has migrated. |
1431 | * | 1413 | * |
1432 | * We need to make sure that the task is still on the same | 1414 | * We need to make sure that the task is still on the same |
@@ -1436,12 +1418,11 @@ retry: | |||
1436 | task = pick_next_pushable_task(rq); | 1418 | task = pick_next_pushable_task(rq); |
1437 | if (task_cpu(next_task) == rq->cpu && task == next_task) { | 1419 | if (task_cpu(next_task) == rq->cpu && task == next_task) { |
1438 | /* | 1420 | /* |
1439 | * If we get here, the task hasn't moved at all, but | 1421 | * The task hasn't migrated, and is still the next |
1440 | * it has failed to push. We will not try again, | 1422 | * eligible task, but we failed to find a run-queue |
1441 | * since the other cpus will pull from us when they | 1423 | * to push it to. Do not retry in this case, since |
1442 | * are ready. | 1424 | * other cpus will pull from us when ready. |
1443 | */ | 1425 | */ |
1444 | dequeue_pushable_task(rq, next_task); | ||
1445 | goto out; | 1426 | goto out; |
1446 | } | 1427 | } |
1447 | 1428 | ||
@@ -1460,6 +1441,7 @@ retry: | |||
1460 | deactivate_task(rq, next_task, 0); | 1441 | deactivate_task(rq, next_task, 0); |
1461 | set_task_cpu(next_task, lowest_rq->cpu); | 1442 | set_task_cpu(next_task, lowest_rq->cpu); |
1462 | activate_task(lowest_rq, next_task, 0); | 1443 | activate_task(lowest_rq, next_task, 0); |
1444 | ret = 1; | ||
1463 | 1445 | ||
1464 | resched_task(lowest_rq->curr); | 1446 | resched_task(lowest_rq->curr); |
1465 | 1447 | ||
@@ -1468,7 +1450,7 @@ retry: | |||
1468 | out: | 1450 | out: |
1469 | put_task_struct(next_task); | 1451 | put_task_struct(next_task); |
1470 | 1452 | ||
1471 | return 1; | 1453 | return ret; |
1472 | } | 1454 | } |
1473 | 1455 | ||
1474 | static void push_rt_tasks(struct rq *rq) | 1456 | static void push_rt_tasks(struct rq *rq) |
@@ -1863,4 +1845,3 @@ static void print_rt_stats(struct seq_file *m, int cpu) | |||
1863 | rcu_read_unlock(); | 1845 | rcu_read_unlock(); |
1864 | } | 1846 | } |
1865 | #endif /* CONFIG_SCHED_DEBUG */ | 1847 | #endif /* CONFIG_SCHED_DEBUG */ |
1866 | |||
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 6f437632afab..8b44e7fa7fb3 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c | |||
@@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) | |||
34 | static void | 34 | static void |
35 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) | 35 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) |
36 | { | 36 | { |
37 | inc_nr_running(rq); | ||
37 | } | 38 | } |
38 | 39 | ||
39 | static void | 40 | static void |
40 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) | 41 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) |
41 | { | 42 | { |
43 | dec_nr_running(rq); | ||
42 | } | 44 | } |
43 | 45 | ||
44 | static void yield_task_stop(struct rq *rq) | 46 | static void yield_task_stop(struct rq *rq) |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 11d65b531e50..2d2ecdcc8cdb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = { | |||
379 | .extra2 = &one, | 379 | .extra2 = &one, |
380 | }, | 380 | }, |
381 | #endif | 381 | #endif |
382 | #ifdef CONFIG_CFS_BANDWIDTH | ||
383 | { | ||
384 | .procname = "sched_cfs_bandwidth_slice_us", | ||
385 | .data = &sysctl_sched_cfs_bandwidth_slice, | ||
386 | .maxlen = sizeof(unsigned int), | ||
387 | .mode = 0644, | ||
388 | .proc_handler = proc_dointvec_minmax, | ||
389 | .extra1 = &one, | ||
390 | }, | ||
391 | #endif | ||
382 | #ifdef CONFIG_PROVE_LOCKING | 392 | #ifdef CONFIG_PROVE_LOCKING |
383 | { | 393 | { |
384 | .procname = "prove_locking", | 394 | .procname = "prove_locking", |