Diffstat (limited to 'kernel/sched_fair.c')
 kernel/sched_fair.c | 443
 1 file changed, 228 insertions(+), 215 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fb8994c6d4bb..98345e45b059 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,6 +73,8 @@ unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
+static const struct sched_class fair_sched_class;
+
 /**************************************************************
  * CFS operations on generic schedulable entities:
  */
@@ -141,6 +143,49 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
+/* return depth at which a sched entity is present in the hierarchy */
+static inline int depth_se(struct sched_entity *se)
+{
+	int depth = 0;
+
+	for_each_sched_entity(se)
+		depth++;
+
+	return depth;
+}
+
+static void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+	int se_depth, pse_depth;
+
+	/*
+	 * preemption test can be made between sibling entities who are in the
+	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
+	 * both tasks until we find their ancestors who are siblings of common
+	 * parent.
+	 */
+
+	/* First walk up until both entities are at same depth */
+	se_depth = depth_se(*se);
+	pse_depth = depth_se(*pse);
+
+	while (se_depth > pse_depth) {
+		se_depth--;
+		*se = parent_entity(*se);
+	}
+
+	while (pse_depth > se_depth) {
+		pse_depth--;
+		*pse = parent_entity(*pse);
+	}
+
+	while (!is_same_group(*se, *pse)) {
+		*se = parent_entity(*se);
+		*pse = parent_entity(*pse);
+	}
+}
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
@@ -191,6 +236,11 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return NULL;
 }
 
+static inline void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+}
+
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 
@@ -221,6 +271,27 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return se->vruntime - cfs_rq->min_vruntime;
 }
 
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+	u64 vruntime = cfs_rq->min_vruntime;
+
+	if (cfs_rq->curr)
+		vruntime = cfs_rq->curr->vruntime;
+
+	if (cfs_rq->rb_leftmost) {
+		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
+						   struct sched_entity,
+						   run_node);
+
+		if (vruntime == cfs_rq->min_vruntime)
+			vruntime = se->vruntime;
+		else
+			vruntime = min_vruntime(vruntime, se->vruntime);
+	}
+
+	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+}
+
 /*
  * Enqueue an entity into the rb-tree:
  */
@@ -254,15 +325,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 * Maintain a cache of leftmost tree entries (it is frequently
 	 * used):
 	 */
-	if (leftmost) {
+	if (leftmost)
 		cfs_rq->rb_leftmost = &se->run_node;
-		/*
-		 * maintain cfs_rq->min_vruntime to be a monotonic increasing
-		 * value tracking the leftmost vruntime in the tree.
-		 */
-		cfs_rq->min_vruntime =
-			max_vruntime(cfs_rq->min_vruntime, se->vruntime);
-	}
 
 	rb_link_node(&se->run_node, parent, link);
 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -272,37 +336,25 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	if (cfs_rq->rb_leftmost == &se->run_node) {
 		struct rb_node *next_node;
-		struct sched_entity *next;
 
 		next_node = rb_next(&se->run_node);
 		cfs_rq->rb_leftmost = next_node;
-
-		if (next_node) {
-			next = rb_entry(next_node,
-					struct sched_entity, run_node);
-			cfs_rq->min_vruntime =
-				max_vruntime(cfs_rq->min_vruntime,
-					     next->vruntime);
-		}
 	}
 
-	if (cfs_rq->next == se)
-		cfs_rq->next = NULL;
-
 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
 
-static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
-{
-	return cfs_rq->rb_leftmost;
-}
-
 static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 {
-	return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
+	struct rb_node *left = cfs_rq->rb_leftmost;
+
+	if (!left)
+		return NULL;
+
+	return rb_entry(left, struct sched_entity, run_node);
 }
 
-static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
 
@@ -334,7 +386,7 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 #endif
 
 /*
- * delta *= w / rw
+ * delta *= P[w / rw]
  */
 static inline unsigned long
 calc_delta_weight(unsigned long delta, struct sched_entity *se)
@@ -348,15 +400,13 @@ calc_delta_weight(unsigned long delta, struct sched_entity *se)
 }
 
 /*
- * delta *= rw / w
+ * delta /= w
  */
 static inline unsigned long
 calc_delta_fair(unsigned long delta, struct sched_entity *se)
 {
-	for_each_sched_entity(se) {
-		delta = calc_delta_mine(delta,
-				cfs_rq_of(se)->load.weight, &se->load);
-	}
+	if (unlikely(se->load.weight != NICE_0_LOAD))
+		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
 
 	return delta;
 }
@@ -386,84 +436,26 @@ static u64 __sched_period(unsigned long nr_running)
  * We calculate the wall-time slice from the period by taking a part
  * proportional to the weight.
  *
- * s = p*w/rw
+ * s = p*P[w/rw]
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
-}
-
-/*
- * We calculate the vruntime slice of a to be inserted task
- *
- * vs = s*rw/w = p
- */
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
 	unsigned long nr_running = cfs_rq->nr_running;
 
-	if (!se->on_rq)
+	if (unlikely(!se->on_rq))
 		nr_running++;
 
-	return __sched_period(nr_running);
+	return calc_delta_weight(__sched_period(nr_running), se);
 }
 
 /*
- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
- * that it favours >=0 over <0.
- *
- *   -20         |
- *               |
- *     0 --------+-------
- *             .'
- *    19     .'
+ * We calculate the vruntime slice of a to be inserted task
  *
+ * vs = s/w
  */
-static unsigned long
-calc_delta_asym(unsigned long delta, struct sched_entity *se)
+static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	struct load_weight lw = {
-		.weight = NICE_0_LOAD,
-		.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
-	};
-
-	for_each_sched_entity(se) {
-		struct load_weight *se_lw = &se->load;
-		unsigned long rw = cfs_rq_of(se)->load.weight;
-
-#ifdef CONFIG_FAIR_SCHED_GROUP
-		struct cfs_rq *cfs_rq = se->my_q;
-		struct task_group *tg = NULL
-
-		if (cfs_rq)
-			tg = cfs_rq->tg;
-
-		if (tg && tg->shares < NICE_0_LOAD) {
-			/*
-			 * scale shares to what it would have been had
-			 * tg->weight been NICE_0_LOAD:
-			 *
-			 * weight = 1024 * shares / tg->weight
-			 */
-			lw.weight *= se->load.weight;
-			lw.weight /= tg->shares;
-
-			lw.inv_weight = 0;
-
-			se_lw = &lw;
-			rw += lw.weight - se->load.weight;
-		} else
-#endif
-
-		if (se->load.weight < NICE_0_LOAD) {
-			se_lw = &lw;
-			rw += NICE_0_LOAD - se->load.weight;
-		}
-
-		delta = calc_delta_mine(delta, rw, se_lw);
-	}
-
-	return delta;
+	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
 /*
@@ -482,6 +474,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
 	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
 	curr->vruntime += delta_exec_weighted;
+	update_min_vruntime(cfs_rq);
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -507,6 +500,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 		struct task_struct *curtask = task_of(curr);
 
 		cpuacct_charge(curtask, delta_exec);
+		account_group_exec_runtime(curtask, delta_exec);
 	}
 }
 
@@ -586,11 +580,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_add(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
+	if (entity_is_task(se)) {
 		add_cfs_task_weight(cfs_rq, se->load.weight);
+		list_add(&se->group_node, &cfs_rq->tasks);
+	}
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
-	list_add(&se->group_node, &cfs_rq->tasks);
 }
 
 static void
@@ -599,11 +594,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
+	if (entity_is_task(se)) {
 		add_cfs_task_weight(cfs_rq, -se->load.weight);
+		list_del_init(&se->group_node);
+	}
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
-	list_del_init(&se->group_node);
 }
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -668,13 +664,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
-	u64 vruntime;
-
-	if (first_fair(cfs_rq)) {
-		vruntime = min_vruntime(cfs_rq->min_vruntime,
-				__pick_next_entity(cfs_rq)->vruntime);
-	} else
-		vruntime = cfs_rq->min_vruntime;
+	u64 vruntime = cfs_rq->min_vruntime;
 
 	/*
 	 * The 'current' period is already promised to the current tasks,
@@ -683,7 +673,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	 * stays open at the end.
 	 */
 	if (initial && sched_feat(START_DEBIT))
-		vruntime += sched_vslice_add(cfs_rq, se);
+		vruntime += sched_vslice(cfs_rq, se);
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
@@ -726,6 +716,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 		__enqueue_entity(cfs_rq, se);
 }
 
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if (cfs_rq->last == se)
+		cfs_rq->last = NULL;
+
+	if (cfs_rq->next == se)
+		cfs_rq->next = NULL;
+}
+
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -748,9 +747,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 #endif
 	}
 
+	clear_buddies(cfs_rq, se);
+
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	account_entity_dequeue(cfs_rq, se);
+	update_min_vruntime(cfs_rq);
 }
 
 /*
@@ -797,29 +799,18 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
-static struct sched_entity *
-pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	struct rq *rq = rq_of(cfs_rq);
-	u64 pair_slice = rq->clock - cfs_rq->pair_start;
-
-	if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
-		cfs_rq->pair_start = rq->clock;
-		return se;
-	}
-
-	return cfs_rq->next;
-}
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *se = NULL;
+	struct sched_entity *se = __pick_next_entity(cfs_rq);
 
-	if (first_fair(cfs_rq)) {
-		se = __pick_next_entity(cfs_rq);
-		se = pick_next(cfs_rq, se);
-		set_next_entity(cfs_rq, se);
-	}
+	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
+		return cfs_rq->next;
+
+	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
+		return cfs_rq->last;
 
 	return se;
 }
@@ -904,11 +895,31 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 		hrtick_start(rq, delta);
 	}
 }
+
+/*
+ * called from enqueue/dequeue and updates the hrtick when the
+ * current task is from our class and nr_running is low enough
+ * to matter.
+ */
+static void hrtick_update(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+
+	if (curr->sched_class != &fair_sched_class)
+		return;
+
+	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
+		hrtick_start_fair(rq, curr);
+}
 #else /* !CONFIG_SCHED_HRTICK */
 static inline void
 hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
 }
+
+static inline void hrtick_update(struct rq *rq)
+{
+}
 #endif
 
 /*
@@ -929,7 +940,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 		wakeup = 1;
 	}
 
-	hrtick_start_fair(rq, rq->curr);
+	hrtick_update(rq);
 }
 
 /*
@@ -951,7 +962,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 		sleep = 1;
 	}
 
-	hrtick_start_fair(rq, rq->curr);
+	hrtick_update(rq);
 }
 
 /*
@@ -971,6 +982,8 @@ static void yield_task_fair(struct rq *rq)
 	if (unlikely(cfs_rq->nr_running == 1))
 		return;
 
+	clear_buddies(cfs_rq, se);
+
 	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
 		update_rq_clock(rq);
 		/*
@@ -1057,8 +1070,6 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 
 #ifdef CONFIG_SMP
 
-static const struct sched_class fair_sched_class;
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * effective_load() calculates the load change as seen from the root_task_group
@@ -1085,7 +1096,6 @@ static long effective_load(struct task_group *tg, int cpu,
 		long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
-	long more_w;
 
 	if (!tg->parent)
 		return wl;
@@ -1097,18 +1107,17 @@ static long effective_load(struct task_group *tg, int cpu,
 	if (!wl && sched_feat(ASYM_EFF_LOAD))
 		return wl;
 
-	/*
-	 * Instead of using this increment, also add the difference
-	 * between when the shares were last updated and now.
-	 */
-	more_w = se->my_q->load.weight - se->my_q->rq_weight;
-	wl += more_w;
-	wg += more_w;
-
 	for_each_sched_entity(se) {
-#define D(n) (likely(n) ? (n) : 1)
-
 		long S, rw, s, a, b;
+		long more_w;
+
+		/*
+		 * Instead of using this increment, also add the difference
+		 * between when the shares were last updated and now.
+		 */
+		more_w = se->my_q->load.weight - se->my_q->rq_weight;
+		wl += more_w;
+		wg += more_w;
 
 		S = se->my_q->tg->shares;
 		s = se->my_q->shares;
@@ -1117,7 +1126,11 @@ static long effective_load(struct task_group *tg, int cpu,
 		a = S*(rw + wl);
 		b = S*rw + s*wg;
 
-		wl = s*(a-b)/D(b);
+		wl = s*(a-b);
+
+		if (likely(b))
+			wl /= b;
+
 		/*
 		 * Assume the group is already running and will
 		 * thus already be accounted for in the weight.
@@ -1126,7 +1139,6 @@ static long effective_load(struct task_group *tg, int cpu,
 		 * alter the group weight.
 		 */
 		wg = 0;
-#undef D
 	}
 
 	return wl;
@@ -1143,7 +1155,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 #endif
 
 static int
-wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
 	    int idx, unsigned long load, unsigned long this_load,
 	    unsigned int imbalance)
@@ -1158,6 +1170,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
 		return 0;
 
+	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+			p->se.avg_overlap > sysctl_sched_migration_cost))
+		sync = 0;
+
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
 	 * effect of the currently running task from the load
@@ -1182,17 +1198,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	 * a reasonable amount of time then attract this newly
 	 * woken task:
 	 */
-	if (sync && balanced) {
-		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-		    p->se.avg_overlap < sysctl_sched_migration_cost)
-			return 1;
-	}
+	if (sync && balanced)
+		return 1;
 
 	schedstat_inc(p, se.nr_wakeups_affine_attempts);
 	tl_per_task = cpu_avg_load_per_task(this_cpu);
 
-	if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
-			balanced) {
+	if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
+			tl_per_task)) {
 		/*
 		 * This domain has SD_WAKE_AFFINE and
 		 * p is cache cold in this domain, and
@@ -1211,16 +1224,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	struct sched_domain *sd, *this_sd = NULL;
 	int prev_cpu, this_cpu, new_cpu;
 	unsigned long load, this_load;
-	struct rq *rq, *this_rq;
+	struct rq *this_rq;
 	unsigned int imbalance;
 	int idx;
 
 	prev_cpu	= task_cpu(p);
-	rq		= task_rq(p);
 	this_cpu	= smp_processor_id();
 	this_rq		= cpu_rq(this_cpu);
 	new_cpu		= prev_cpu;
 
+	if (prev_cpu == this_cpu)
+		goto out;
 	/*
 	 * 'this_sd' is the first domain that both
 	 * this_cpu and prev_cpu are present in:
@@ -1248,13 +1262,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	load = source_load(prev_cpu, idx);
 	this_load = target_load(this_cpu, idx);
 
-	if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+	if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
 			load, this_load, imbalance))
 		return this_cpu;
 
-	if (prev_cpu == this_cpu)
-		goto out;
-
 	/*
 	 * Start passive balancing when half the imbalance_pct
 	 * limit is reached.
@@ -1280,9 +1291,7 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 	 * More easily preempt - nice tasks, while not making it harder for
 	 * + nice tasks.
 	 */
-	if (sched_feat(ASYM_GRAN))
-		gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
-	else
+	if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
 		gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
 
 	return gran;
@@ -1307,7 +1316,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 {
 	s64 gran, vdiff = curr->vruntime - se->vruntime;
 
-	if (vdiff < 0)
+	if (vdiff <= 0)
 		return -1;
 
 	gran = wakeup_gran(curr);
@@ -1317,38 +1326,60 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 		return 0;
 }
 
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
+static void set_last_buddy(struct sched_entity *se)
 {
-	int depth = 0;
-
 	for_each_sched_entity(se)
-		depth++;
+		cfs_rq_of(se)->last = se;
+}
 
-	return depth;
+static void set_next_buddy(struct sched_entity *se)
+{
+	for_each_sched_entity(se)
+		cfs_rq_of(se)->next = se;
 }
 
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 {
 	struct task_struct *curr = rq->curr;
-	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
-	int se_depth, pse_depth;
 
 	if (unlikely(rt_prio(p->prio))) {
+		struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+
 		update_rq_clock(rq);
 		update_curr(cfs_rq);
 		resched_task(curr);
 		return;
 	}
 
+	if (unlikely(p->sched_class != &fair_sched_class))
+		return;
+
 	if (unlikely(se == pse))
 		return;
 
-	cfs_rq_of(pse)->next = pse;
+	/*
+	 * Only set the backward buddy when the current task is still on the
+	 * rq. This can happen when a wakeup gets interleaved with schedule on
+	 * the ->pre_schedule() or idle_balance() point, either of which can
+	 * drop the rq lock.
+	 *
+	 * Also, during early boot the idle thread is in the fair class, for
+	 * obvious reasons its a bad idea to schedule back to the idle thread.
+	 */
+	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
+		set_last_buddy(se);
+	set_next_buddy(pse);
+
+	/*
+	 * We can come here with TIF_NEED_RESCHED already set from new task
+	 * wake up path.
+	 */
+	if (test_tsk_need_resched(curr))
+		return;
 
 	/*
 	 * Batch tasks do not preempt (their preemption is driven by
@@ -1360,34 +1391,26 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
-	/*
-	 * preemption test can be made between sibling entities who are in the
-	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
-	 * both tasks until we find their ancestors who are siblings of common
-	 * parent.
-	 */
+	if (sched_feat(WAKEUP_OVERLAP) && (sync ||
+			(se->avg_overlap < sysctl_sched_migration_cost &&
+			 pse->avg_overlap < sysctl_sched_migration_cost))) {
+		resched_task(curr);
+		return;
+	}
 
-	/* First walk up until both entities are at same depth */
-	se_depth = depth_se(se);
-	pse_depth = depth_se(pse);
+	find_matching_se(&se, &pse);
 
-	while (se_depth > pse_depth) {
-		se_depth--;
-		se = parent_entity(se);
-	}
+	while (se) {
+		BUG_ON(!pse);
 
-	while (pse_depth > se_depth) {
-		pse_depth--;
-		pse = parent_entity(pse);
-	}
+		if (wakeup_preempt_entity(se, pse) == 1) {
+			resched_task(curr);
+			break;
+		}
 
-	while (!is_same_group(se, pse)) {
 		se = parent_entity(se);
 		pse = parent_entity(pse);
 	}
-
-	if (wakeup_preempt_entity(se, pse) == 1)
-		resched_task(curr);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1401,6 +1424,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 
 	do {
 		se = pick_next_entity(cfs_rq);
+		set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
@@ -1445,19 +1469,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
 	if (next == &cfs_rq->tasks)
 		return NULL;
 
-	/* Skip over entities that are not tasks */
-	do {
-		se = list_entry(next, struct sched_entity, group_node);
-		next = next->next;
-	} while (next != &cfs_rq->tasks && !entity_is_task(se));
-
-	if (next == &cfs_rq->tasks)
-		return NULL;
-
-	cfs_rq->balance_iterator = next;
-
-	if (entity_is_task(se))
-		p = task_of(se);
+	se = list_entry(next, struct sched_entity, group_node);
+	p = task_of(se);
+	cfs_rq->balance_iterator = next->next;
 
 	return p;
 }
@@ -1507,7 +1521,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	rcu_read_lock();
 	update_h_load(busiest_cpu);
 
-	list_for_each_entry(tg, &task_groups, list) {
+	list_for_each_entry_rcu(tg, &task_groups, list) {
 		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
 		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
 		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
@@ -1620,10 +1634,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		 * 'current' within the tree based on its new key value.
 		 */
 		swap(curr->vruntime, se->vruntime);
+		resched_task(rq->curr);
 	}
 
 	enqueue_task_fair(rq, p, 0);
-	resched_task(rq->curr);
 }
 
 /*
@@ -1642,7 +1656,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
 	} else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 /*
@@ -1659,7 +1673,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
 	if (running)
 		resched_task(rq->curr);
 	else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 /* Account for a task changing its policy or group.
@@ -1693,9 +1707,6 @@ static const struct sched_class fair_sched_class = {
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
-#ifdef CONFIG_SMP
-	.select_task_rq		= select_task_rq_fair,
-#endif /* CONFIG_SMP */
 
 	.check_preempt_curr	= check_preempt_wakeup,
 
@@ -1703,6 +1714,8 @@ static const struct sched_class fair_sched_class = {
 	.put_prev_task		= put_prev_task_fair,
 
 #ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_fair,
+
 	.load_balance		= load_balance_fair,
 	.move_one_task		= move_one_task_fair,
 #endif