Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r-- | kernel/sched_fair.c | 811 |
1 files changed, 328 insertions, 483 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 67c67a87146e..a17b785d7000 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,22 +25,26 @@ | |||
25 | * (default: 20ms, units: nanoseconds) | 25 | * (default: 20ms, units: nanoseconds) |
26 | * | 26 | * |
27 | * NOTE: this latency value is not the same as the concept of | 27 | * NOTE: this latency value is not the same as the concept of |
28 | * 'timeslice length' - timeslices in CFS are of variable length. | 28 | * 'timeslice length' - timeslices in CFS are of variable length |
29 | * (to see the precise effective timeslice length of your workload, | 29 | * and have no persistent notion like in traditional, time-slice |
30 | * run vmstat and monitor the context-switches field) | 30 | * based scheduling concepts. |
31 | * | 31 | * |
32 | * On SMP systems the value of this is multiplied by the log2 of the | 32 | * (to see the precise effective timeslice length of your workload, |
33 | * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way | 33 | * run vmstat and monitor the context-switches (cs) field) |
34 | * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) | ||
35 | * Targeted preemption latency for CPU-bound tasks: | ||
36 | */ | 34 | */ |
37 | unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; | 35 | const_debug unsigned int sysctl_sched_latency = 20000000ULL; |
36 | |||
37 | /* | ||
38 | * After fork, child runs first. (default) If set to 0 then | ||
39 | * parent will (try to) run first. | ||
40 | */ | ||
41 | const_debug unsigned int sysctl_sched_child_runs_first = 1; | ||
38 | 42 | ||
39 | /* | 43 | /* |
40 | * Minimal preemption granularity for CPU-bound tasks: | 44 | * Minimal preemption granularity for CPU-bound tasks: |
41 | * (default: 2 msec, units: nanoseconds) | 45 | * (default: 2 msec, units: nanoseconds) |
42 | */ | 46 | */ |
43 | unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; | 47 | const_debug unsigned int sysctl_sched_nr_latency = 20; |
44 | 48 | ||
45 | /* | 49 | /* |
46 | * sys_sched_yield() compat mode | 50 | * sys_sched_yield() compat mode |
@@ -52,52 +56,25 @@ unsigned int __read_mostly sysctl_sched_compat_yield; | |||
52 | 56 | ||
53 | /* | 57 | /* |
54 | * SCHED_BATCH wake-up granularity. | 58 | * SCHED_BATCH wake-up granularity. |
55 | * (default: 25 msec, units: nanoseconds) | 59 | * (default: 10 msec, units: nanoseconds) |
56 | * | 60 | * |
57 | * This option delays the preemption effects of decoupled workloads | 61 | * This option delays the preemption effects of decoupled workloads |
58 | * and reduces their over-scheduling. Synchronous workloads will still | 62 | * and reduces their over-scheduling. Synchronous workloads will still |
59 | * have immediate wakeup/sleep latencies. | 63 | * have immediate wakeup/sleep latencies. |
60 | */ | 64 | */ |
61 | unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; | 65 | const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; |
62 | 66 | ||
63 | /* | 67 | /* |
64 | * SCHED_OTHER wake-up granularity. | 68 | * SCHED_OTHER wake-up granularity. |
65 | * (default: 1 msec, units: nanoseconds) | 69 | * (default: 10 msec, units: nanoseconds) |
66 | * | 70 | * |
67 | * This option delays the preemption effects of decoupled workloads | 71 | * This option delays the preemption effects of decoupled workloads |
68 | * and reduces their over-scheduling. Synchronous workloads will still | 72 | * and reduces their over-scheduling. Synchronous workloads will still |
69 | * have immediate wakeup/sleep latencies. | 73 | * have immediate wakeup/sleep latencies. |
70 | */ | 74 | */ |
71 | unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; | 75 | const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL; |
72 | |||
73 | unsigned int sysctl_sched_stat_granularity __read_mostly; | ||
74 | |||
75 | /* | ||
76 | * Initialized in sched_init_granularity() [to 5 times the base granularity]: | ||
77 | */ | ||
78 | unsigned int sysctl_sched_runtime_limit __read_mostly; | ||
79 | |||
80 | /* | ||
81 | * Debugging: various feature bits | ||
82 | */ | ||
83 | enum { | ||
84 | SCHED_FEAT_FAIR_SLEEPERS = 1, | ||
85 | SCHED_FEAT_SLEEPER_AVG = 2, | ||
86 | SCHED_FEAT_SLEEPER_LOAD_AVG = 4, | ||
87 | SCHED_FEAT_PRECISE_CPU_LOAD = 8, | ||
88 | SCHED_FEAT_START_DEBIT = 16, | ||
89 | SCHED_FEAT_SKIP_INITIAL = 32, | ||
90 | }; | ||
91 | 76 | ||
92 | unsigned int sysctl_sched_features __read_mostly = | 77 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
93 | SCHED_FEAT_FAIR_SLEEPERS *1 | | ||
94 | SCHED_FEAT_SLEEPER_AVG *0 | | ||
95 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | | ||
96 | SCHED_FEAT_PRECISE_CPU_LOAD *1 | | ||
97 | SCHED_FEAT_START_DEBIT *1 | | ||
98 | SCHED_FEAT_SKIP_INITIAL *0; | ||
99 | |||
100 | extern struct sched_class fair_sched_class; | ||
101 | 78 | ||
102 | /************************************************************** | 79 | /************************************************************** |
103 | * CFS operations on generic schedulable entities: | 80 | * CFS operations on generic schedulable entities: |
@@ -111,21 +88,9 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
111 | return cfs_rq->rq; | 88 | return cfs_rq->rq; |
112 | } | 89 | } |
113 | 90 | ||
114 | /* currently running entity (if any) on this cfs_rq */ | ||
115 | static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) | ||
116 | { | ||
117 | return cfs_rq->curr; | ||
118 | } | ||
119 | |||
120 | /* An entity is a task if it doesn't "own" a runqueue */ | 91 | /* An entity is a task if it doesn't "own" a runqueue */ |
121 | #define entity_is_task(se) (!se->my_q) | 92 | #define entity_is_task(se) (!se->my_q) |
122 | 93 | ||
123 | static inline void | ||
124 | set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
125 | { | ||
126 | cfs_rq->curr = se; | ||
127 | } | ||
128 | |||
129 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 94 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
130 | 95 | ||
131 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | 96 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) |
@@ -133,21 +98,8 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
133 | return container_of(cfs_rq, struct rq, cfs); | 98 | return container_of(cfs_rq, struct rq, cfs); |
134 | } | 99 | } |
135 | 100 | ||
136 | static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) | ||
137 | { | ||
138 | struct rq *rq = rq_of(cfs_rq); | ||
139 | |||
140 | if (unlikely(rq->curr->sched_class != &fair_sched_class)) | ||
141 | return NULL; | ||
142 | |||
143 | return &rq->curr->se; | ||
144 | } | ||
145 | |||
146 | #define entity_is_task(se) 1 | 101 | #define entity_is_task(se) 1 |
147 | 102 | ||
148 | static inline void | ||
149 | set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } | ||
150 | |||
151 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 103 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
152 | 104 | ||
153 | static inline struct task_struct *task_of(struct sched_entity *se) | 105 | static inline struct task_struct *task_of(struct sched_entity *se) |
@@ -160,16 +112,38 @@ static inline struct task_struct *task_of(struct sched_entity *se) | |||
160 | * Scheduling class tree data structure manipulation methods: | 112 | * Scheduling class tree data structure manipulation methods: |
161 | */ | 113 | */ |
162 | 114 | ||
115 | static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) | ||
116 | { | ||
117 | s64 delta = (s64)(vruntime - min_vruntime); | ||
118 | if (delta > 0) | ||
119 | min_vruntime = vruntime; | ||
120 | |||
121 | return min_vruntime; | ||
122 | } | ||
123 | |||
124 | static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) | ||
125 | { | ||
126 | s64 delta = (s64)(vruntime - min_vruntime); | ||
127 | if (delta < 0) | ||
128 | min_vruntime = vruntime; | ||
129 | |||
130 | return min_vruntime; | ||
131 | } | ||
132 | |||
133 | static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
134 | { | ||
135 | return se->vruntime - cfs_rq->min_vruntime; | ||
136 | } | ||
137 | |||
163 | /* | 138 | /* |
164 | * Enqueue an entity into the rb-tree: | 139 | * Enqueue an entity into the rb-tree: |
165 | */ | 140 | */ |
166 | static inline void | 141 | static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
167 | __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
168 | { | 142 | { |
169 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | 143 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; |
170 | struct rb_node *parent = NULL; | 144 | struct rb_node *parent = NULL; |
171 | struct sched_entity *entry; | 145 | struct sched_entity *entry; |
172 | s64 key = se->fair_key; | 146 | s64 key = entity_key(cfs_rq, se); |
173 | int leftmost = 1; | 147 | int leftmost = 1; |
174 | 148 | ||
175 | /* | 149 | /* |
@@ -182,7 +156,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
182 | * We dont care about collisions. Nodes with | 156 | * We dont care about collisions. Nodes with |
183 | * the same key stay together. | 157 | * the same key stay together. |
184 | */ | 158 | */ |
185 | if (key - entry->fair_key < 0) { | 159 | if (key < entity_key(cfs_rq, entry)) { |
186 | link = &parent->rb_left; | 160 | link = &parent->rb_left; |
187 | } else { | 161 | } else { |
188 | link = &parent->rb_right; | 162 | link = &parent->rb_right; |
@@ -199,24 +173,14 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
199 | 173 | ||
200 | rb_link_node(&se->run_node, parent, link); | 174 | rb_link_node(&se->run_node, parent, link); |
201 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); | 175 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); |
202 | update_load_add(&cfs_rq->load, se->load.weight); | ||
203 | cfs_rq->nr_running++; | ||
204 | se->on_rq = 1; | ||
205 | |||
206 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); | ||
207 | } | 176 | } |
208 | 177 | ||
209 | static inline void | 178 | static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
210 | __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
211 | { | 179 | { |
212 | if (cfs_rq->rb_leftmost == &se->run_node) | 180 | if (cfs_rq->rb_leftmost == &se->run_node) |
213 | cfs_rq->rb_leftmost = rb_next(&se->run_node); | 181 | cfs_rq->rb_leftmost = rb_next(&se->run_node); |
214 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | ||
215 | update_load_sub(&cfs_rq->load, se->load.weight); | ||
216 | cfs_rq->nr_running--; | ||
217 | se->on_rq = 0; | ||
218 | 182 | ||
219 | schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); | 183 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
220 | } | 184 | } |
221 | 185 | ||
222 | static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) | 186 | static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) |
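The max_vruntime()/min_vruntime() helpers introduced above compare two u64 clock values through a signed delta instead of a plain '<'/'>', and entity_key() keys the rbtree on vruntime relative to cfs_rq->min_vruntime. The usual reading of this idiom (not spelled out in the patch itself) is that the signed delta keeps the ordering correct even if the 64-bit virtual clock wraps. A tiny standalone illustration in plain C, not kernel code:

#include <stdint.h>
#include <stdio.h>

/* same shape as the helper added above: compare via a signed delta so the
 * ordering survives wraparound of the u64 clock */
static uint64_t max_vruntime(uint64_t min_vruntime, uint64_t vruntime)
{
        int64_t delta = (int64_t)(vruntime - min_vruntime);
        return delta > 0 ? vruntime : min_vruntime;
}

int main(void)
{
        uint64_t just_before_wrap = UINT64_MAX - 100;
        uint64_t just_after_wrap  = 50;      /* numerically tiny, logically later */

        /* a plain '>' would pick the pre-wrap value; the signed delta does not */
        printf("plain '>' picks:      %llu\n",
               (unsigned long long)(just_before_wrap > just_after_wrap ?
                                    just_before_wrap : just_after_wrap));
        printf("max_vruntime() picks: %llu\n",
               (unsigned long long)max_vruntime(just_before_wrap, just_after_wrap));
        return 0;
}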
@@ -229,118 +193,86 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | |||
229 | return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); | 193 | return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); |
230 | } | 194 | } |
231 | 195 | ||
196 | static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | ||
197 | { | ||
198 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | ||
199 | struct sched_entity *se = NULL; | ||
200 | struct rb_node *parent; | ||
201 | |||
202 | while (*link) { | ||
203 | parent = *link; | ||
204 | se = rb_entry(parent, struct sched_entity, run_node); | ||
205 | link = &parent->rb_right; | ||
206 | } | ||
207 | |||
208 | return se; | ||
209 | } | ||
210 | |||
232 | /************************************************************** | 211 | /************************************************************** |
233 | * Scheduling class statistics methods: | 212 | * Scheduling class statistics methods: |
234 | */ | 213 | */ |
235 | 214 | ||
215 | |||
236 | /* | 216 | /* |
237 | * Calculate the preemption granularity needed to schedule every | 217 | * The idea is to set a period in which each task runs once. |
238 | * runnable task once per sysctl_sched_latency amount of time. | ||
239 | * (down to a sensible low limit on granularity) | ||
240 | * | ||
241 | * For example, if there are 2 tasks running and latency is 10 msecs, | ||
242 | * we switch tasks every 5 msecs. If we have 3 tasks running, we have | ||
243 | * to switch tasks every 3.33 msecs to get a 10 msecs observed latency | ||
244 | * for each task. We do finer and finer scheduling up to until we | ||
245 | * reach the minimum granularity value. | ||
246 | * | ||
247 | * To achieve this we use the following dynamic-granularity rule: | ||
248 | * | 218 | * |
249 | * gran = lat/nr - lat/nr/nr | 219 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch |
220 | * this period because otherwise the slices get too small. | ||
250 | * | 221 | * |
251 | * This comes out of the following equations: | 222 | * p = (nr <= nl) ? l : l*nr/nl |
252 | * | ||
253 | * kA1 + gran = kB1 | ||
254 | * kB2 + gran = kA2 | ||
255 | * kA2 = kA1 | ||
256 | * kB2 = kB1 - d + d/nr | ||
257 | * lat = d * nr | ||
258 | * | ||
259 | * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), | ||
260 | * '1' is start of time, '2' is end of time, 'd' is delay between | ||
261 | * 1 and 2 (during which task B was running), 'nr' is number of tasks | ||
262 | * running, 'lat' is the the period of each task. ('lat' is the | ||
263 | * sched_latency that we aim for.) | ||
264 | */ | 223 | */ |
265 | static long | 224 | static u64 __sched_period(unsigned long nr_running) |
266 | sched_granularity(struct cfs_rq *cfs_rq) | ||
267 | { | 225 | { |
268 | unsigned int gran = sysctl_sched_latency; | 226 | u64 period = sysctl_sched_latency; |
269 | unsigned int nr = cfs_rq->nr_running; | 227 | unsigned long nr_latency = sysctl_sched_nr_latency; |
270 | 228 | ||
271 | if (nr > 1) { | 229 | if (unlikely(nr_running > nr_latency)) { |
272 | gran = gran/nr - gran/nr/nr; | 230 | period *= nr_running; |
273 | gran = max(gran, sysctl_sched_min_granularity); | 231 | do_div(period, nr_latency); |
274 | } | 232 | } |
275 | 233 | ||
276 | return gran; | 234 | return period; |
277 | } | 235 | } |
278 | 236 | ||
279 | /* | 237 | /* |
280 | * We rescale the rescheduling granularity of tasks according to their | 238 | * We calculate the wall-time slice from the period by taking a part |
281 | * nice level, but only linearly, not exponentially: | 239 | * proportional to the weight. |
240 | * | ||
241 | * s = p*w/rw | ||
282 | */ | 242 | */ |
283 | static long | 243 | static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) |
284 | niced_granularity(struct sched_entity *curr, unsigned long granularity) | ||
285 | { | 244 | { |
286 | u64 tmp; | 245 | u64 slice = __sched_period(cfs_rq->nr_running); |
287 | 246 | ||
288 | if (likely(curr->load.weight == NICE_0_LOAD)) | 247 | slice *= se->load.weight; |
289 | return granularity; | 248 | do_div(slice, cfs_rq->load.weight); |
290 | /* | ||
291 | * Positive nice levels get the same granularity as nice-0: | ||
292 | */ | ||
293 | if (likely(curr->load.weight < NICE_0_LOAD)) { | ||
294 | tmp = curr->load.weight * (u64)granularity; | ||
295 | return (long) (tmp >> NICE_0_SHIFT); | ||
296 | } | ||
297 | /* | ||
298 | * Negative nice level tasks get linearly finer | ||
299 | * granularity: | ||
300 | */ | ||
301 | tmp = curr->load.inv_weight * (u64)granularity; | ||
302 | 249 | ||
303 | /* | 250 | return slice; |
304 | * It will always fit into 'long': | ||
305 | */ | ||
306 | return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT)); | ||
307 | } | 251 | } |
308 | 252 | ||
309 | static inline void | 253 | /* |
310 | limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) | 254 | * We calculate the vruntime slice. |
255 | * | ||
256 | * vs = s/w = p/rw | ||
257 | */ | ||
258 | static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) | ||
311 | { | 259 | { |
312 | long limit = sysctl_sched_runtime_limit; | 260 | u64 vslice = __sched_period(nr_running); |
313 | 261 | ||
314 | /* | 262 | do_div(vslice, rq_weight); |
315 | * Niced tasks have the same history dynamic range as | 263 | |
316 | * non-niced tasks: | 264 | return vslice; |
317 | */ | ||
318 | if (unlikely(se->wait_runtime > limit)) { | ||
319 | se->wait_runtime = limit; | ||
320 | schedstat_inc(se, wait_runtime_overruns); | ||
321 | schedstat_inc(cfs_rq, wait_runtime_overruns); | ||
322 | } | ||
323 | if (unlikely(se->wait_runtime < -limit)) { | ||
324 | se->wait_runtime = -limit; | ||
325 | schedstat_inc(se, wait_runtime_underruns); | ||
326 | schedstat_inc(cfs_rq, wait_runtime_underruns); | ||
327 | } | ||
328 | } | 265 | } |
329 | 266 | ||
330 | static inline void | 267 | static u64 sched_vslice(struct cfs_rq *cfs_rq) |
331 | __add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | ||
332 | { | 268 | { |
333 | se->wait_runtime += delta; | 269 | return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); |
334 | schedstat_add(se, sum_wait_runtime, delta); | ||
335 | limit_wait_runtime(cfs_rq, se); | ||
336 | } | 270 | } |
337 | 271 | ||
338 | static void | 272 | static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) |
339 | add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | ||
340 | { | 273 | { |
341 | schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); | 274 | return __sched_vslice(cfs_rq->load.weight + se->load.weight, |
342 | __add_wait_runtime(cfs_rq, se, delta); | 275 | cfs_rq->nr_running + 1); |
343 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); | ||
344 | } | 276 | } |
345 | 277 | ||
346 | /* | 278 | /* |
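The three helpers above implement the formulas in their comments directly: __sched_period() stretches the latency period once more than sysctl_sched_nr_latency tasks are runnable, sched_slice() gives each entity a wall-time share of that period proportional to its weight, and __sched_vslice() expresses the same share in vruntime units. A rough standalone sketch with made-up weights (assuming the usual nice-0 weight of 1024; illustrative only, not kernel code):

#include <stdio.h>

#define SCHED_LATENCY_NS  20000000ULL   /* defaults from this patch */
#define SCHED_NR_LATENCY  20UL

/* p = (nr <= nl) ? l : l*nr/nl */
static unsigned long long sched_period(unsigned long nr_running)
{
        unsigned long long period = SCHED_LATENCY_NS;

        if (nr_running > SCHED_NR_LATENCY)
                period = period * nr_running / SCHED_NR_LATENCY;
        return period;
}

/* wall-time slice: s = p*w/rw */
static unsigned long long slice(unsigned long weight, unsigned long rq_weight,
                                unsigned long nr_running)
{
        return sched_period(nr_running) * weight / rq_weight;
}

/* vruntime slice: vs = p/rw */
static unsigned long long vslice(unsigned long rq_weight, unsigned long nr_running)
{
        return sched_period(nr_running) / rq_weight;
}

int main(void)
{
        /* two runnable tasks: one at weight 1024, one heavier at 3072 */
        unsigned long w_light = 1024, w_heavy = 3072;
        unsigned long rq_weight = w_light + w_heavy;

        printf("period      : %llu ns\n", sched_period(2));
        printf("light slice : %llu ns\n", slice(w_light, rq_weight, 2));
        printf("heavy slice : %llu ns\n", slice(w_heavy, rq_weight, 2));
        printf("vslice      : %llu\n",    vslice(rq_weight, 2));
        return 0;
}

With the defaults this prints a 20 ms period split into 5 ms and 15 ms wall-time slices, while the vruntime slice is the same for both tasks, which is the point of the virtual-time bookkeeping.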
@@ -348,46 +280,41 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | |||
348 | * are not in our scheduling class. | 280 | * are not in our scheduling class. |
349 | */ | 281 | */ |
350 | static inline void | 282 | static inline void |
351 | __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 283 | __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, |
284 | unsigned long delta_exec) | ||
352 | { | 285 | { |
353 | unsigned long delta, delta_exec, delta_fair, delta_mine; | 286 | unsigned long delta_exec_weighted; |
354 | struct load_weight *lw = &cfs_rq->load; | 287 | u64 vruntime; |
355 | unsigned long load = lw->weight; | ||
356 | 288 | ||
357 | delta_exec = curr->delta_exec; | ||
358 | schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); | 289 | schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); |
359 | 290 | ||
360 | curr->sum_exec_runtime += delta_exec; | 291 | curr->sum_exec_runtime += delta_exec; |
361 | cfs_rq->exec_clock += delta_exec; | 292 | schedstat_add(cfs_rq, exec_clock, delta_exec); |
362 | 293 | delta_exec_weighted = delta_exec; | |
363 | if (unlikely(!load)) | 294 | if (unlikely(curr->load.weight != NICE_0_LOAD)) { |
364 | return; | 295 | delta_exec_weighted = calc_delta_fair(delta_exec_weighted, |
365 | 296 | &curr->load); | |
366 | delta_fair = calc_delta_fair(delta_exec, lw); | ||
367 | delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); | ||
368 | |||
369 | if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) { | ||
370 | delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); | ||
371 | delta = min(delta, (unsigned long)( | ||
372 | (long)sysctl_sched_runtime_limit - curr->wait_runtime)); | ||
373 | cfs_rq->sleeper_bonus -= delta; | ||
374 | delta_mine -= delta; | ||
375 | } | 297 | } |
298 | curr->vruntime += delta_exec_weighted; | ||
376 | 299 | ||
377 | cfs_rq->fair_clock += delta_fair; | ||
378 | /* | 300 | /* |
379 | * We executed delta_exec amount of time on the CPU, | 301 | * maintain cfs_rq->min_vruntime to be a monotonic increasing |
380 | * but we were only entitled to delta_mine amount of | 302 | * value tracking the leftmost vruntime in the tree. |
381 | * time during that period (if nr_running == 1 then | ||
382 | * the two values are equal) | ||
383 | * [Note: delta_mine - delta_exec is negative]: | ||
384 | */ | 303 | */ |
385 | add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); | 304 | if (first_fair(cfs_rq)) { |
305 | vruntime = min_vruntime(curr->vruntime, | ||
306 | __pick_next_entity(cfs_rq)->vruntime); | ||
307 | } else | ||
308 | vruntime = curr->vruntime; | ||
309 | |||
310 | cfs_rq->min_vruntime = | ||
311 | max_vruntime(cfs_rq->min_vruntime, vruntime); | ||
386 | } | 312 | } |
387 | 313 | ||
388 | static void update_curr(struct cfs_rq *cfs_rq) | 314 | static void update_curr(struct cfs_rq *cfs_rq) |
389 | { | 315 | { |
390 | struct sched_entity *curr = cfs_rq_curr(cfs_rq); | 316 | struct sched_entity *curr = cfs_rq->curr; |
317 | u64 now = rq_of(cfs_rq)->clock; | ||
391 | unsigned long delta_exec; | 318 | unsigned long delta_exec; |
392 | 319 | ||
393 | if (unlikely(!curr)) | 320 | if (unlikely(!curr)) |
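__update_curr() now advances the running entity's vruntime by the wall time it just executed, scaled by calc_delta_fair() when the weight differs from nice-0, and then ratchets cfs_rq->min_vruntime forward so it monotonically tracks the leftmost vruntime. A rough model of the weighting step, approximating calc_delta_fair() as delta * NICE_0_LOAD / weight (which is what it comes to, up to rounding); the weights are illustrative:

#include <stdio.h>

#define NICE_0_LOAD 1024UL   /* the usual nice-0 weight */

/* rough stand-in for calc_delta_fair(): scale wall time by NICE_0_LOAD/weight */
static unsigned long long weighted_delta(unsigned long long delta_exec,
                                         unsigned long weight)
{
        if (weight == NICE_0_LOAD)
                return delta_exec;            /* the fast path kept in the patch */
        return delta_exec * NICE_0_LOAD / weight;
}

int main(void)
{
        unsigned long long delta_exec = 10000000ULL;    /* 10 ms of CPU time */

        /* same wall time, different weights: the heavier task's vruntime
         * advances more slowly, so it stays to the left of the tree longer */
        printf("nice-0 task: vruntime += %llu ns\n", weighted_delta(delta_exec, 1024));
        printf("heavy task : vruntime += %llu ns\n", weighted_delta(delta_exec, 3072));
        return 0;
}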
@@ -398,135 +325,47 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
398 | * since the last time we changed load (this cannot | 325 | * since the last time we changed load (this cannot |
399 | * overflow on 32 bits): | 326 | * overflow on 32 bits): |
400 | */ | 327 | */ |
401 | delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start); | 328 | delta_exec = (unsigned long)(now - curr->exec_start); |
402 | 329 | ||
403 | curr->delta_exec += delta_exec; | 330 | __update_curr(cfs_rq, curr, delta_exec); |
404 | 331 | curr->exec_start = now; | |
405 | if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { | ||
406 | __update_curr(cfs_rq, curr); | ||
407 | curr->delta_exec = 0; | ||
408 | } | ||
409 | curr->exec_start = rq_of(cfs_rq)->clock; | ||
410 | } | 332 | } |
411 | 333 | ||
412 | static inline void | 334 | static inline void |
413 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | 335 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) |
414 | { | 336 | { |
415 | se->wait_start_fair = cfs_rq->fair_clock; | ||
416 | schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); | 337 | schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); |
417 | } | 338 | } |
418 | 339 | ||
419 | /* | 340 | /* |
420 | * We calculate fair deltas here, so protect against the random effects | ||
421 | * of a multiplication overflow by capping it to the runtime limit: | ||
422 | */ | ||
423 | #if BITS_PER_LONG == 32 | ||
424 | static inline unsigned long | ||
425 | calc_weighted(unsigned long delta, unsigned long weight, int shift) | ||
426 | { | ||
427 | u64 tmp = (u64)delta * weight >> shift; | ||
428 | |||
429 | if (unlikely(tmp > sysctl_sched_runtime_limit*2)) | ||
430 | return sysctl_sched_runtime_limit*2; | ||
431 | return tmp; | ||
432 | } | ||
433 | #else | ||
434 | static inline unsigned long | ||
435 | calc_weighted(unsigned long delta, unsigned long weight, int shift) | ||
436 | { | ||
437 | return delta * weight >> shift; | ||
438 | } | ||
439 | #endif | ||
440 | |||
441 | /* | ||
442 | * Task is being enqueued - update stats: | 341 | * Task is being enqueued - update stats: |
443 | */ | 342 | */ |
444 | static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 343 | static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
445 | { | 344 | { |
446 | s64 key; | ||
447 | |||
448 | /* | 345 | /* |
449 | * Are we enqueueing a waiting task? (for current tasks | 346 | * Are we enqueueing a waiting task? (for current tasks |
450 | * a dequeue/enqueue event is a NOP) | 347 | * a dequeue/enqueue event is a NOP) |
451 | */ | 348 | */ |
452 | if (se != cfs_rq_curr(cfs_rq)) | 349 | if (se != cfs_rq->curr) |
453 | update_stats_wait_start(cfs_rq, se); | 350 | update_stats_wait_start(cfs_rq, se); |
454 | /* | ||
455 | * Update the key: | ||
456 | */ | ||
457 | key = cfs_rq->fair_clock; | ||
458 | |||
459 | /* | ||
460 | * Optimize the common nice 0 case: | ||
461 | */ | ||
462 | if (likely(se->load.weight == NICE_0_LOAD)) { | ||
463 | key -= se->wait_runtime; | ||
464 | } else { | ||
465 | u64 tmp; | ||
466 | |||
467 | if (se->wait_runtime < 0) { | ||
468 | tmp = -se->wait_runtime; | ||
469 | key += (tmp * se->load.inv_weight) >> | ||
470 | (WMULT_SHIFT - NICE_0_SHIFT); | ||
471 | } else { | ||
472 | tmp = se->wait_runtime; | ||
473 | key -= (tmp * se->load.inv_weight) >> | ||
474 | (WMULT_SHIFT - NICE_0_SHIFT); | ||
475 | } | ||
476 | } | ||
477 | |||
478 | se->fair_key = key; | ||
479 | } | ||
480 | |||
481 | /* | ||
482 | * Note: must be called with a freshly updated rq->fair_clock. | ||
483 | */ | ||
484 | static inline void | ||
485 | __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
486 | { | ||
487 | unsigned long delta_fair = se->delta_fair_run; | ||
488 | |||
489 | schedstat_set(se->wait_max, max(se->wait_max, | ||
490 | rq_of(cfs_rq)->clock - se->wait_start)); | ||
491 | |||
492 | if (unlikely(se->load.weight != NICE_0_LOAD)) | ||
493 | delta_fair = calc_weighted(delta_fair, se->load.weight, | ||
494 | NICE_0_SHIFT); | ||
495 | |||
496 | add_wait_runtime(cfs_rq, se, delta_fair); | ||
497 | } | 351 | } |
498 | 352 | ||
499 | static void | 353 | static void |
500 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | 354 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) |
501 | { | 355 | { |
502 | unsigned long delta_fair; | 356 | schedstat_set(se->wait_max, max(se->wait_max, |
503 | 357 | rq_of(cfs_rq)->clock - se->wait_start)); | |
504 | if (unlikely(!se->wait_start_fair)) | ||
505 | return; | ||
506 | |||
507 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), | ||
508 | (u64)(cfs_rq->fair_clock - se->wait_start_fair)); | ||
509 | |||
510 | se->delta_fair_run += delta_fair; | ||
511 | if (unlikely(abs(se->delta_fair_run) >= | ||
512 | sysctl_sched_stat_granularity)) { | ||
513 | __update_stats_wait_end(cfs_rq, se); | ||
514 | se->delta_fair_run = 0; | ||
515 | } | ||
516 | |||
517 | se->wait_start_fair = 0; | ||
518 | schedstat_set(se->wait_start, 0); | 358 | schedstat_set(se->wait_start, 0); |
519 | } | 359 | } |
520 | 360 | ||
521 | static inline void | 361 | static inline void |
522 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 362 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
523 | { | 363 | { |
524 | update_curr(cfs_rq); | ||
525 | /* | 364 | /* |
526 | * Mark the end of the wait period if dequeueing a | 365 | * Mark the end of the wait period if dequeueing a |
527 | * waiting task: | 366 | * waiting task: |
528 | */ | 367 | */ |
529 | if (se != cfs_rq_curr(cfs_rq)) | 368 | if (se != cfs_rq->curr) |
530 | update_stats_wait_end(cfs_rq, se); | 369 | update_stats_wait_end(cfs_rq, se); |
531 | } | 370 | } |
532 | 371 | ||
@@ -542,79 +381,28 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
542 | se->exec_start = rq_of(cfs_rq)->clock; | 381 | se->exec_start = rq_of(cfs_rq)->clock; |
543 | } | 382 | } |
544 | 383 | ||
545 | /* | ||
546 | * We are descheduling a task - update its stats: | ||
547 | */ | ||
548 | static inline void | ||
549 | update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
550 | { | ||
551 | se->exec_start = 0; | ||
552 | } | ||
553 | |||
554 | /************************************************** | 384 | /************************************************** |
555 | * Scheduling class queueing methods: | 385 | * Scheduling class queueing methods: |
556 | */ | 386 | */ |
557 | 387 | ||
558 | static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 388 | static void |
389 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
559 | { | 390 | { |
560 | unsigned long load = cfs_rq->load.weight, delta_fair; | 391 | update_load_add(&cfs_rq->load, se->load.weight); |
561 | long prev_runtime; | 392 | cfs_rq->nr_running++; |
562 | 393 | se->on_rq = 1; | |
563 | /* | 394 | } |
564 | * Do not boost sleepers if there's too much bonus 'in flight' | ||
565 | * already: | ||
566 | */ | ||
567 | if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) | ||
568 | return; | ||
569 | |||
570 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) | ||
571 | load = rq_of(cfs_rq)->cpu_load[2]; | ||
572 | |||
573 | delta_fair = se->delta_fair_sleep; | ||
574 | |||
575 | /* | ||
576 | * Fix up delta_fair with the effect of us running | ||
577 | * during the whole sleep period: | ||
578 | */ | ||
579 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) | ||
580 | delta_fair = div64_likely32((u64)delta_fair * load, | ||
581 | load + se->load.weight); | ||
582 | |||
583 | if (unlikely(se->load.weight != NICE_0_LOAD)) | ||
584 | delta_fair = calc_weighted(delta_fair, se->load.weight, | ||
585 | NICE_0_SHIFT); | ||
586 | |||
587 | prev_runtime = se->wait_runtime; | ||
588 | __add_wait_runtime(cfs_rq, se, delta_fair); | ||
589 | delta_fair = se->wait_runtime - prev_runtime; | ||
590 | 395 | ||
591 | /* | 396 | static void |
592 | * Track the amount of bonus we've given to sleepers: | 397 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
593 | */ | 398 | { |
594 | cfs_rq->sleeper_bonus += delta_fair; | 399 | update_load_sub(&cfs_rq->load, se->load.weight); |
400 | cfs_rq->nr_running--; | ||
401 | se->on_rq = 0; | ||
595 | } | 402 | } |
596 | 403 | ||
597 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 404 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
598 | { | 405 | { |
599 | struct task_struct *tsk = task_of(se); | ||
600 | unsigned long delta_fair; | ||
601 | |||
602 | if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || | ||
603 | !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) | ||
604 | return; | ||
605 | |||
606 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), | ||
607 | (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); | ||
608 | |||
609 | se->delta_fair_sleep += delta_fair; | ||
610 | if (unlikely(abs(se->delta_fair_sleep) >= | ||
611 | sysctl_sched_stat_granularity)) { | ||
612 | __enqueue_sleeper(cfs_rq, se); | ||
613 | se->delta_fair_sleep = 0; | ||
614 | } | ||
615 | |||
616 | se->sleep_start_fair = 0; | ||
617 | |||
618 | #ifdef CONFIG_SCHEDSTATS | 406 | #ifdef CONFIG_SCHEDSTATS |
619 | if (se->sleep_start) { | 407 | if (se->sleep_start) { |
620 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; | 408 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; |
@@ -646,6 +434,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
646 | * time that the task spent sleeping: | 434 | * time that the task spent sleeping: |
647 | */ | 435 | */ |
648 | if (unlikely(prof_on == SLEEP_PROFILING)) { | 436 | if (unlikely(prof_on == SLEEP_PROFILING)) { |
437 | struct task_struct *tsk = task_of(se); | ||
438 | |||
649 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), | 439 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), |
650 | delta >> 20); | 440 | delta >> 20); |
651 | } | 441 | } |
@@ -653,27 +443,81 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
653 | #endif | 443 | #endif |
654 | } | 444 | } |
655 | 445 | ||
446 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
447 | { | ||
448 | #ifdef CONFIG_SCHED_DEBUG | ||
449 | s64 d = se->vruntime - cfs_rq->min_vruntime; | ||
450 | |||
451 | if (d < 0) | ||
452 | d = -d; | ||
453 | |||
454 | if (d > 3*sysctl_sched_latency) | ||
455 | schedstat_inc(cfs_rq, nr_spread_over); | ||
456 | #endif | ||
457 | } | ||
458 | |||
459 | static void | ||
460 | place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | ||
461 | { | ||
462 | u64 vruntime; | ||
463 | |||
464 | vruntime = cfs_rq->min_vruntime; | ||
465 | |||
466 | if (sched_feat(TREE_AVG)) { | ||
467 | struct sched_entity *last = __pick_last_entity(cfs_rq); | ||
468 | if (last) { | ||
469 | vruntime += last->vruntime; | ||
470 | vruntime >>= 1; | ||
471 | } | ||
472 | } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) | ||
473 | vruntime += sched_vslice(cfs_rq)/2; | ||
474 | |||
475 | if (initial && sched_feat(START_DEBIT)) | ||
476 | vruntime += sched_vslice_add(cfs_rq, se); | ||
477 | |||
478 | if (!initial) { | ||
479 | if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) && | ||
480 | task_of(se)->policy != SCHED_BATCH) | ||
481 | vruntime -= sysctl_sched_latency; | ||
482 | |||
483 | vruntime = max_t(s64, vruntime, se->vruntime); | ||
484 | } | ||
485 | |||
486 | se->vruntime = vruntime; | ||
487 | |||
488 | } | ||
489 | |||
656 | static void | 490 | static void |
657 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) | 491 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) |
658 | { | 492 | { |
659 | /* | 493 | /* |
660 | * Update the fair clock. | 494 | * Update run-time statistics of the 'current'. |
661 | */ | 495 | */ |
662 | update_curr(cfs_rq); | 496 | update_curr(cfs_rq); |
663 | 497 | ||
664 | if (wakeup) | 498 | if (wakeup) { |
499 | place_entity(cfs_rq, se, 0); | ||
665 | enqueue_sleeper(cfs_rq, se); | 500 | enqueue_sleeper(cfs_rq, se); |
501 | } | ||
666 | 502 | ||
667 | update_stats_enqueue(cfs_rq, se); | 503 | update_stats_enqueue(cfs_rq, se); |
668 | __enqueue_entity(cfs_rq, se); | 504 | check_spread(cfs_rq, se); |
505 | if (se != cfs_rq->curr) | ||
506 | __enqueue_entity(cfs_rq, se); | ||
507 | account_entity_enqueue(cfs_rq, se); | ||
669 | } | 508 | } |
670 | 509 | ||
671 | static void | 510 | static void |
672 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | 511 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) |
673 | { | 512 | { |
513 | /* | ||
514 | * Update run-time statistics of the 'current'. | ||
515 | */ | ||
516 | update_curr(cfs_rq); | ||
517 | |||
674 | update_stats_dequeue(cfs_rq, se); | 518 | update_stats_dequeue(cfs_rq, se); |
675 | if (sleep) { | 519 | if (sleep) { |
676 | se->sleep_start_fair = cfs_rq->fair_clock; | 520 | se->peer_preempt = 0; |
677 | #ifdef CONFIG_SCHEDSTATS | 521 | #ifdef CONFIG_SCHEDSTATS |
678 | if (entity_is_task(se)) { | 522 | if (entity_is_task(se)) { |
679 | struct task_struct *tsk = task_of(se); | 523 | struct task_struct *tsk = task_of(se); |
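place_entity() above decides where a newly enqueued entity lands on the virtual timeline: everything is relative to cfs_rq->min_vruntime, START_DEBIT pushes a freshly forked task one vruntime slice to the right, and a waking sleeper is credited up to sysctl_sched_latency but never moved backwards past the vruntime it already had. A toy model of just those two cases (the TREE_AVG/APPROX_AVG branches are left out and the numbers are made up):

#include <stdio.h>

#define SCHED_LATENCY_NS 20000000LL

/* toy place_entity(): only the START_DEBIT and sleeper-credit cases */
static long long place(long long min_vruntime, long long se_vruntime,
                       long long vslice, int initial)
{
        long long vruntime = min_vruntime;

        if (initial)
                vruntime += vslice;              /* debit a fresh fork one slice */
        else {
                vruntime -= SCHED_LATENCY_NS;    /* credit a waking sleeper ... */
                if (vruntime < se_vruntime)      /* ... but never move it backwards */
                        vruntime = se_vruntime;
        }
        return vruntime;
}

int main(void)
{
        long long min_vr = 1000000000LL, vslice = 5000000LL;

        printf("forked child  : %lld\n", place(min_vr, 0, vslice, 1));
        printf("long sleeper  : %lld\n", place(min_vr, 900000000LL, vslice, 0));
        printf("short sleeper : %lld\n", place(min_vr, 995000000LL, vslice, 0));
        return 0;
}

The clamp in the non-initial case is what keeps a task that only slept briefly from collecting credit it did not earn.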
@@ -685,68 +529,66 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | |||
685 | } | 529 | } |
686 | #endif | 530 | #endif |
687 | } | 531 | } |
688 | __dequeue_entity(cfs_rq, se); | 532 | |
533 | if (se != cfs_rq->curr) | ||
534 | __dequeue_entity(cfs_rq, se); | ||
535 | account_entity_dequeue(cfs_rq, se); | ||
689 | } | 536 | } |
690 | 537 | ||
691 | /* | 538 | /* |
692 | * Preempt the current task with a newly woken task if needed: | 539 | * Preempt the current task with a newly woken task if needed: |
693 | */ | 540 | */ |
694 | static void | 541 | static void |
695 | __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, | 542 | check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
696 | struct sched_entity *curr, unsigned long granularity) | ||
697 | { | 543 | { |
698 | s64 __delta = curr->fair_key - se->fair_key; | ||
699 | unsigned long ideal_runtime, delta_exec; | 544 | unsigned long ideal_runtime, delta_exec; |
700 | 545 | ||
701 | /* | 546 | ideal_runtime = sched_slice(cfs_rq, curr); |
702 | * ideal_runtime is compared against sum_exec_runtime, which is | ||
703 | * walltime, hence do not scale. | ||
704 | */ | ||
705 | ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running, | ||
706 | (unsigned long)sysctl_sched_min_granularity); | ||
707 | |||
708 | /* | ||
709 | * If we executed more than what the latency constraint suggests, | ||
710 | * reduce the rescheduling granularity. This way the total latency | ||
711 | * of how much a task is not scheduled converges to | ||
712 | * sysctl_sched_latency: | ||
713 | */ | ||
714 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | 547 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; |
715 | if (delta_exec > ideal_runtime) | 548 | if (delta_exec > ideal_runtime || |
716 | granularity = 0; | 549 | (sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt)) |
717 | |||
718 | /* | ||
719 | * Take scheduling granularity into account - do not | ||
720 | * preempt the current task unless the best task has | ||
721 | * a larger than sched_granularity fairness advantage: | ||
722 | * | ||
723 | * scale granularity as key space is in fair_clock. | ||
724 | */ | ||
725 | if (__delta > niced_granularity(curr, granularity)) | ||
726 | resched_task(rq_of(cfs_rq)->curr); | 550 | resched_task(rq_of(cfs_rq)->curr); |
551 | curr->peer_preempt = 0; | ||
727 | } | 552 | } |
728 | 553 | ||
729 | static inline void | 554 | static void |
730 | set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | 555 | set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
731 | { | 556 | { |
557 | /* 'current' is not kept within the tree. */ | ||
558 | if (se->on_rq) { | ||
559 | /* | ||
560 | * Any task has to be enqueued before it get to execute on | ||
561 | * a CPU. So account for the time it spent waiting on the | ||
562 | * runqueue. | ||
563 | */ | ||
564 | update_stats_wait_end(cfs_rq, se); | ||
565 | __dequeue_entity(cfs_rq, se); | ||
566 | } | ||
567 | |||
568 | update_stats_curr_start(cfs_rq, se); | ||
569 | cfs_rq->curr = se; | ||
570 | #ifdef CONFIG_SCHEDSTATS | ||
732 | /* | 571 | /* |
733 | * Any task has to be enqueued before it get to execute on | 572 | * Track our maximum slice length, if the CPU's load is at |
734 | * a CPU. So account for the time it spent waiting on the | 573 | * least twice that of our own weight (i.e. dont track it |
735 | * runqueue. (note, here we rely on pick_next_task() having | 574 | * when there are only lesser-weight tasks around): |
736 | * done a put_prev_task_fair() shortly before this, which | ||
737 | * updated rq->fair_clock - used by update_stats_wait_end()) | ||
738 | */ | 575 | */ |
739 | update_stats_wait_end(cfs_rq, se); | 576 | if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { |
740 | update_stats_curr_start(cfs_rq, se); | 577 | se->slice_max = max(se->slice_max, |
741 | set_cfs_rq_curr(cfs_rq, se); | 578 | se->sum_exec_runtime - se->prev_sum_exec_runtime); |
579 | } | ||
580 | #endif | ||
742 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 581 | se->prev_sum_exec_runtime = se->sum_exec_runtime; |
743 | } | 582 | } |
744 | 583 | ||
745 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | 584 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) |
746 | { | 585 | { |
747 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 586 | struct sched_entity *se = NULL; |
748 | 587 | ||
749 | set_next_entity(cfs_rq, se); | 588 | if (first_fair(cfs_rq)) { |
589 | se = __pick_next_entity(cfs_rq); | ||
590 | set_next_entity(cfs_rq, se); | ||
591 | } | ||
750 | 592 | ||
751 | return se; | 593 | return se; |
752 | } | 594 | } |
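check_preempt_tick() replaces the old key-distance test: on every tick it compares how long 'current' has run since it was picked (sum_exec_runtime - prev_sum_exec_runtime) against the wall-time slice sched_slice() says it is entitled to, and reschedules once that slice is used up. A toy tick loop showing just that comparison (the PREEMPT_RESTRICT/peer_preempt handling is ignored; numbers are made up):

#include <stdio.h>

/* toy tick check: reschedule once 'current' has consumed its slice */
static int tick_should_resched(unsigned long long sum_exec,
                               unsigned long long prev_sum_exec,
                               unsigned long long ideal_runtime)
{
        return (sum_exec - prev_sum_exec) > ideal_runtime;
}

int main(void)
{
        unsigned long long ideal = 5000000ULL;      /* e.g. a 5 ms slice */
        unsigned long long prev = 40000000ULL;      /* runtime when it was picked */
        unsigned long long now;

        for (now = prev; now <= prev + 7000000ULL; now += 1000000ULL)  /* 1 ms ticks */
                printf("ran %llu ns this round -> resched=%d\n",
                       now - prev, tick_should_resched(now, prev, ideal));
        return 0;
}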
@@ -760,33 +602,24 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
760 | if (prev->on_rq) | 602 | if (prev->on_rq) |
761 | update_curr(cfs_rq); | 603 | update_curr(cfs_rq); |
762 | 604 | ||
763 | update_stats_curr_end(cfs_rq, prev); | 605 | check_spread(cfs_rq, prev); |
764 | 606 | if (prev->on_rq) { | |
765 | if (prev->on_rq) | ||
766 | update_stats_wait_start(cfs_rq, prev); | 607 | update_stats_wait_start(cfs_rq, prev); |
767 | set_cfs_rq_curr(cfs_rq, NULL); | 608 | /* Put 'current' back into the tree. */ |
609 | __enqueue_entity(cfs_rq, prev); | ||
610 | } | ||
611 | cfs_rq->curr = NULL; | ||
768 | } | 612 | } |
769 | 613 | ||
770 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 614 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
771 | { | 615 | { |
772 | struct sched_entity *next; | ||
773 | |||
774 | /* | 616 | /* |
775 | * Dequeue and enqueue the task to update its | 617 | * Update run-time statistics of the 'current'. |
776 | * position within the tree: | ||
777 | */ | 618 | */ |
778 | dequeue_entity(cfs_rq, curr, 0); | 619 | update_curr(cfs_rq); |
779 | enqueue_entity(cfs_rq, curr, 0); | ||
780 | |||
781 | /* | ||
782 | * Reschedule if another task tops the current one. | ||
783 | */ | ||
784 | next = __pick_next_entity(cfs_rq); | ||
785 | if (next == curr) | ||
786 | return; | ||
787 | 620 | ||
788 | __check_preempt_curr_fair(cfs_rq, next, curr, | 621 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) |
789 | sched_granularity(cfs_rq)); | 622 | check_preempt_tick(cfs_rq, curr); |
790 | } | 623 | } |
791 | 624 | ||
792 | /************************************************** | 625 | /************************************************** |
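A structural change running through set_next_entity() and put_prev_entity() above is that 'current' is no longer kept inside the rbtree: picking an entity removes it from the tree, cfs_rq->curr points at it while it runs, and put_prev_entity() reinserts it with its updated vruntime. A toy model using a sorted array in place of the rbtree (illustrative only, not kernel code):

#include <stdio.h>
#include <stdlib.h>

struct toy_se { const char *name; unsigned long long vruntime; };

static int cmp(const void *a, const void *b)
{
        const struct toy_se *x = a, *y = b;
        return (x->vruntime > y->vruntime) - (x->vruntime < y->vruntime);
}

int main(void)
{
        struct toy_se queue[] = { { "A", 100 }, { "B", 140 }, { "C", 120 } };
        int nr = 3, round;

        for (round = 0; round < 4; round++) {
                struct toy_se curr;

                qsort(queue, nr, sizeof(queue[0]), cmp);
                curr = queue[0];                 /* pick_next: leftmost entity */
                queue[0] = queue[--nr];          /* set_next_entity: out of the tree */

                curr.vruntime += 30;             /* it runs; update_curr() advances it */
                printf("ran %s, vruntime now %llu\n", curr.name, curr.vruntime);

                queue[nr++] = curr;              /* put_prev_entity: back into the tree */
        }
        return 0;
}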
@@ -821,23 +654,28 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
821 | */ | 654 | */ |
822 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | 655 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) |
823 | { | 656 | { |
824 | /* A later patch will take group into account */ | 657 | return cfs_rq->tg->cfs_rq[this_cpu]; |
825 | return &cpu_rq(this_cpu)->cfs; | ||
826 | } | 658 | } |
827 | 659 | ||
828 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | 660 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ |
829 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 661 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
830 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 662 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
831 | 663 | ||
832 | /* Do the two (enqueued) tasks belong to the same group ? */ | 664 | /* Do the two (enqueued) entities belong to the same group ? */ |
833 | static inline int is_same_group(struct task_struct *curr, struct task_struct *p) | 665 | static inline int |
666 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
834 | { | 667 | { |
835 | if (curr->se.cfs_rq == p->se.cfs_rq) | 668 | if (se->cfs_rq == pse->cfs_rq) |
836 | return 1; | 669 | return 1; |
837 | 670 | ||
838 | return 0; | 671 | return 0; |
839 | } | 672 | } |
840 | 673 | ||
674 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
675 | { | ||
676 | return se->parent; | ||
677 | } | ||
678 | |||
841 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 679 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
842 | 680 | ||
843 | #define for_each_sched_entity(se) \ | 681 | #define for_each_sched_entity(se) \ |
@@ -870,11 +708,17 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
870 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 708 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
871 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | 709 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) |
872 | 710 | ||
873 | static inline int is_same_group(struct task_struct *curr, struct task_struct *p) | 711 | static inline int |
712 | is_same_group(struct sched_entity *se, struct sched_entity *pse) | ||
874 | { | 713 | { |
875 | return 1; | 714 | return 1; |
876 | } | 715 | } |
877 | 716 | ||
717 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | ||
718 | { | ||
719 | return NULL; | ||
720 | } | ||
721 | |||
878 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 722 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
879 | 723 | ||
880 | /* | 724 | /* |
@@ -892,6 +736,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | |||
892 | break; | 736 | break; |
893 | cfs_rq = cfs_rq_of(se); | 737 | cfs_rq = cfs_rq_of(se); |
894 | enqueue_entity(cfs_rq, se, wakeup); | 738 | enqueue_entity(cfs_rq, se, wakeup); |
739 | wakeup = 1; | ||
895 | } | 740 | } |
896 | } | 741 | } |
897 | 742 | ||
@@ -911,6 +756,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | |||
911 | /* Don't dequeue parent if it has other entities besides us */ | 756 | /* Don't dequeue parent if it has other entities besides us */ |
912 | if (cfs_rq->load.weight) | 757 | if (cfs_rq->load.weight) |
913 | break; | 758 | break; |
759 | sleep = 1; | ||
914 | } | 760 | } |
915 | } | 761 | } |
916 | 762 | ||
@@ -919,12 +765,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | |||
919 | * | 765 | * |
920 | * If compat_yield is turned on then we requeue to the end of the tree. | 766 | * If compat_yield is turned on then we requeue to the end of the tree. |
921 | */ | 767 | */ |
922 | static void yield_task_fair(struct rq *rq, struct task_struct *p) | 768 | static void yield_task_fair(struct rq *rq) |
923 | { | 769 | { |
924 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 770 | struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr); |
925 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | 771 | struct sched_entity *rightmost, *se = &rq->curr->se; |
926 | struct sched_entity *rightmost, *se = &p->se; | ||
927 | struct rb_node *parent; | ||
928 | 772 | ||
929 | /* | 773 | /* |
930 | * Are we the only task in the tree? | 774 | * Are we the only task in the tree? |
@@ -935,52 +779,39 @@ static void yield_task_fair(struct rq *rq, struct task_struct *p) | |||
935 | if (likely(!sysctl_sched_compat_yield)) { | 779 | if (likely(!sysctl_sched_compat_yield)) { |
936 | __update_rq_clock(rq); | 780 | __update_rq_clock(rq); |
937 | /* | 781 | /* |
938 | * Dequeue and enqueue the task to update its | 782 | * Update run-time statistics of the 'current'. |
939 | * position within the tree: | ||
940 | */ | 783 | */ |
941 | dequeue_entity(cfs_rq, &p->se, 0); | 784 | update_curr(cfs_rq); |
942 | enqueue_entity(cfs_rq, &p->se, 0); | ||
943 | 785 | ||
944 | return; | 786 | return; |
945 | } | 787 | } |
946 | /* | 788 | /* |
947 | * Find the rightmost entry in the rbtree: | 789 | * Find the rightmost entry in the rbtree: |
948 | */ | 790 | */ |
949 | do { | 791 | rightmost = __pick_last_entity(cfs_rq); |
950 | parent = *link; | ||
951 | link = &parent->rb_right; | ||
952 | } while (*link); | ||
953 | |||
954 | rightmost = rb_entry(parent, struct sched_entity, run_node); | ||
955 | /* | 792 | /* |
956 | * Already in the rightmost position? | 793 | * Already in the rightmost position? |
957 | */ | 794 | */ |
958 | if (unlikely(rightmost == se)) | 795 | if (unlikely(rightmost->vruntime < se->vruntime)) |
959 | return; | 796 | return; |
960 | 797 | ||
961 | /* | 798 | /* |
962 | * Minimally necessary key value to be last in the tree: | 799 | * Minimally necessary key value to be last in the tree: |
800 | * Upon rescheduling, sched_class::put_prev_task() will place | ||
801 | * 'current' within the tree based on its new key value. | ||
963 | */ | 802 | */ |
964 | se->fair_key = rightmost->fair_key + 1; | 803 | se->vruntime = rightmost->vruntime + 1; |
965 | |||
966 | if (cfs_rq->rb_leftmost == &se->run_node) | ||
967 | cfs_rq->rb_leftmost = rb_next(&se->run_node); | ||
968 | /* | ||
969 | * Relink the task to the rightmost position: | ||
970 | */ | ||
971 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | ||
972 | rb_link_node(&se->run_node, parent, link); | ||
973 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); | ||
974 | } | 804 | } |
975 | 805 | ||
976 | /* | 806 | /* |
977 | * Preempt the current task with a newly woken task if needed: | 807 | * Preempt the current task with a newly woken task if needed: |
978 | */ | 808 | */ |
979 | static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) | 809 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) |
980 | { | 810 | { |
981 | struct task_struct *curr = rq->curr; | 811 | struct task_struct *curr = rq->curr; |
982 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 812 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
983 | unsigned long gran; | 813 | struct sched_entity *se = &curr->se, *pse = &p->se; |
814 | s64 delta, gran; | ||
984 | 815 | ||
985 | if (unlikely(rt_prio(p->prio))) { | 816 | if (unlikely(rt_prio(p->prio))) { |
986 | update_rq_clock(rq); | 817 | update_rq_clock(rq); |
@@ -988,16 +819,31 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) | |||
988 | resched_task(curr); | 819 | resched_task(curr); |
989 | return; | 820 | return; |
990 | } | 821 | } |
991 | |||
992 | gran = sysctl_sched_wakeup_granularity; | ||
993 | /* | 822 | /* |
994 | * Batch tasks prefer throughput over latency: | 823 | * Batch tasks do not preempt (their preemption is driven by |
824 | * the tick): | ||
995 | */ | 825 | */ |
996 | if (unlikely(p->policy == SCHED_BATCH)) | 826 | if (unlikely(p->policy == SCHED_BATCH)) |
997 | gran = sysctl_sched_batch_wakeup_granularity; | 827 | return; |
828 | |||
829 | if (sched_feat(WAKEUP_PREEMPT)) { | ||
830 | while (!is_same_group(se, pse)) { | ||
831 | se = parent_entity(se); | ||
832 | pse = parent_entity(pse); | ||
833 | } | ||
998 | 834 | ||
999 | if (is_same_group(curr, p)) | 835 | delta = se->vruntime - pse->vruntime; |
1000 | __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); | 836 | gran = sysctl_sched_wakeup_granularity; |
837 | if (unlikely(se->load.weight != NICE_0_LOAD)) | ||
838 | gran = calc_delta_fair(gran, &se->load); | ||
839 | |||
840 | if (delta > gran) { | ||
841 | int now = !sched_feat(PREEMPT_RESTRICT); | ||
842 | |||
843 | if (now || p->prio < curr->prio || !se->peer_preempt++) | ||
844 | resched_task(curr); | ||
845 | } | ||
846 | } | ||
1001 | } | 847 | } |
1002 | 848 | ||
1003 | static struct task_struct *pick_next_task_fair(struct rq *rq) | 849 | static struct task_struct *pick_next_task_fair(struct rq *rq) |
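check_preempt_wakeup() now decides wakeup preemption on vruntime: the woken task preempts only when 'current' is ahead of it by more than sysctl_sched_wakeup_granularity, and that granularity is scaled down for heavier 'current' tasks via calc_delta_fair(). A rough standalone version of the check (same nice-0 weight assumption of 1024 as above; the PREEMPT_RESTRICT/peer_preempt path is left out):

#include <stdio.h>

#define NICE_0_LOAD            1024L
#define WAKEUP_GRANULARITY_NS  10000000LL     /* default from this patch */

static int should_preempt(long long curr_vruntime, long long woken_vruntime,
                          long curr_weight)
{
        long long gran = WAKEUP_GRANULARITY_NS;

        if (curr_weight != NICE_0_LOAD)       /* rough calc_delta_fair() stand-in */
                gran = gran * NICE_0_LOAD / curr_weight;

        return (curr_vruntime - woken_vruntime) > gran;
}

int main(void)
{
        /* 15 ms behind a nice-0 current: beyond the granularity, preempt */
        printf("15 ms behind, nice-0 current: %d\n",
               should_preempt(100000000LL, 85000000LL, 1024));
        /* only 5 ms behind a nice-0 current: within the granularity, no preempt */
        printf(" 5 ms behind, nice-0 current: %d\n",
               should_preempt(100000000LL, 95000000LL, 1024));
        /* 5 ms behind a heavier current: the scaled granularity is smaller, preempt */
        printf(" 5 ms behind, heavy current : %d\n",
               should_preempt(100000000LL, 95000000LL, 3072));
        return 0;
}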
@@ -1041,7 +887,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
1041 | * achieve that by always pre-iterating before returning | 887 | * achieve that by always pre-iterating before returning |
1042 | * the current task: | 888 | * the current task: |
1043 | */ | 889 | */ |
1044 | static inline struct task_struct * | 890 | static struct task_struct * |
1045 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) | 891 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) |
1046 | { | 892 | { |
1047 | struct task_struct *p; | 893 | struct task_struct *p; |
@@ -1078,7 +924,10 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | |||
1078 | if (!cfs_rq->nr_running) | 924 | if (!cfs_rq->nr_running) |
1079 | return MAX_PRIO; | 925 | return MAX_PRIO; |
1080 | 926 | ||
1081 | curr = __pick_next_entity(cfs_rq); | 927 | curr = cfs_rq->curr; |
928 | if (!curr) | ||
929 | curr = __pick_next_entity(cfs_rq); | ||
930 | |||
1082 | p = task_of(curr); | 931 | p = task_of(curr); |
1083 | 932 | ||
1084 | return p->prio; | 933 | return p->prio; |
@@ -1153,6 +1002,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) | |||
1153 | } | 1002 | } |
1154 | } | 1003 | } |
1155 | 1004 | ||
1005 | #define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) | ||
1006 | |||
1156 | /* | 1007 | /* |
1157 | * Share the fairness runtime between parent and child, thus the | 1008 | * Share the fairness runtime between parent and child, thus the |
1158 | * total amount of pressure for CPU stays equal - new tasks | 1009 | * total amount of pressure for CPU stays equal - new tasks |
@@ -1163,37 +1014,32 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) | |||
1163 | static void task_new_fair(struct rq *rq, struct task_struct *p) | 1014 | static void task_new_fair(struct rq *rq, struct task_struct *p) |
1164 | { | 1015 | { |
1165 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 1016 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
1166 | struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); | 1017 | struct sched_entity *se = &p->se, *curr = cfs_rq->curr; |
1018 | int this_cpu = smp_processor_id(); | ||
1167 | 1019 | ||
1168 | sched_info_queued(p); | 1020 | sched_info_queued(p); |
1169 | 1021 | ||
1170 | update_curr(cfs_rq); | 1022 | update_curr(cfs_rq); |
1171 | update_stats_enqueue(cfs_rq, se); | 1023 | place_entity(cfs_rq, se, 1); |
1172 | /* | ||
1173 | * Child runs first: we let it run before the parent | ||
1174 | * until it reschedules once. We set up the key so that | ||
1175 | * it will preempt the parent: | ||
1176 | */ | ||
1177 | se->fair_key = curr->fair_key - | ||
1178 | niced_granularity(curr, sched_granularity(cfs_rq)) - 1; | ||
1179 | /* | ||
1180 | * The first wait is dominated by the child-runs-first logic, | ||
1181 | * so do not credit it with that waiting time yet: | ||
1182 | */ | ||
1183 | if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) | ||
1184 | se->wait_start_fair = 0; | ||
1185 | 1024 | ||
1186 | /* | 1025 | if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && |
1187 | * The statistical average of wait_runtime is about | 1026 | curr->vruntime < se->vruntime) { |
1188 | * -granularity/2, so initialize the task with that: | 1027 | /* |
1189 | */ | 1028 | * Upon rescheduling, sched_class::put_prev_task() will place |
1190 | if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) | 1029 | * 'current' within the tree based on its new key value. |
1191 | se->wait_runtime = -(sched_granularity(cfs_rq) / 2); | 1030 | */ |
1031 | swap(curr->vruntime, se->vruntime); | ||
1032 | } | ||
1192 | 1033 | ||
1034 | update_stats_enqueue(cfs_rq, se); | ||
1035 | check_spread(cfs_rq, se); | ||
1036 | check_spread(cfs_rq, curr); | ||
1193 | __enqueue_entity(cfs_rq, se); | 1037 | __enqueue_entity(cfs_rq, se); |
1038 | account_entity_enqueue(cfs_rq, se); | ||
1039 | se->peer_preempt = 0; | ||
1040 | resched_task(rq->curr); | ||
1194 | } | 1041 | } |
1195 | 1042 | ||
1196 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1197 | /* Account for a task changing its policy or group. | 1043 | /* Account for a task changing its policy or group. |
1198 | * | 1044 | * |
1199 | * This routine is mostly called to set cfs_rq->curr field when a task | 1045 | * This routine is mostly called to set cfs_rq->curr field when a task |
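task_new_fair() now lets place_entity(..., 1) put the child to the right of min_vruntime (the START_DEBIT placement) and, when sysctl_sched_child_runs_first is set and the child would otherwise trail its parent, simply swaps the two vruntimes so the child ends up leftmost; the parent, as 'current', is later re-inserted by put_prev_task() with the larger value. A toy version of that swap (numbers made up):

#include <stdio.h>

#define swap_ll(a, b) do { long long t = (a); (a) = (b); (b) = t; } while (0)

int main(void)
{
        int child_runs_first = 1;
        long long parent_vruntime = 1000000000LL;
        long long child_vruntime  = 1005000000LL;   /* where START_DEBIT placed it */

        /* same condition as in task_new_fair(): only swap if the child would
         * otherwise run after the parent */
        if (child_runs_first && parent_vruntime < child_vruntime)
                swap_ll(parent_vruntime, child_vruntime);

        printf("parent vruntime: %lld\n", parent_vruntime);
        printf("child  vruntime: %lld\n", child_vruntime);
        printf("runs next      : %s\n",
               child_vruntime <= parent_vruntime ? "child" : "parent");
        return 0;
}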
@@ -1206,21 +1052,17 @@ static void set_curr_task_fair(struct rq *rq) | |||
1206 | for_each_sched_entity(se) | 1052 | for_each_sched_entity(se) |
1207 | set_next_entity(cfs_rq_of(se), se); | 1053 | set_next_entity(cfs_rq_of(se), se); |
1208 | } | 1054 | } |
1209 | #else | ||
1210 | static void set_curr_task_fair(struct rq *rq) | ||
1211 | { | ||
1212 | } | ||
1213 | #endif | ||
1214 | 1055 | ||
1215 | /* | 1056 | /* |
1216 | * All the scheduling class methods: | 1057 | * All the scheduling class methods: |
1217 | */ | 1058 | */ |
1218 | struct sched_class fair_sched_class __read_mostly = { | 1059 | static const struct sched_class fair_sched_class = { |
1060 | .next = &idle_sched_class, | ||
1219 | .enqueue_task = enqueue_task_fair, | 1061 | .enqueue_task = enqueue_task_fair, |
1220 | .dequeue_task = dequeue_task_fair, | 1062 | .dequeue_task = dequeue_task_fair, |
1221 | .yield_task = yield_task_fair, | 1063 | .yield_task = yield_task_fair, |
1222 | 1064 | ||
1223 | .check_preempt_curr = check_preempt_curr_fair, | 1065 | .check_preempt_curr = check_preempt_wakeup, |
1224 | 1066 | ||
1225 | .pick_next_task = pick_next_task_fair, | 1067 | .pick_next_task = pick_next_task_fair, |
1226 | .put_prev_task = put_prev_task_fair, | 1068 | .put_prev_task = put_prev_task_fair, |
@@ -1237,6 +1079,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) | |||
1237 | { | 1079 | { |
1238 | struct cfs_rq *cfs_rq; | 1080 | struct cfs_rq *cfs_rq; |
1239 | 1081 | ||
1082 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1083 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); | ||
1084 | #endif | ||
1240 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 1085 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
1241 | print_cfs_rq(m, cpu, cfs_rq); | 1086 | print_cfs_rq(m, cpu, cfs_rq); |
1242 | } | 1087 | } |