author     Andrea Bastoni <bastoni@cs.unc.edu>   2011-08-27 09:43:54 -0400
committer  Andrea Bastoni <bastoni@cs.unc.edu>   2011-08-27 10:06:11 -0400
commit     7b1bb388bc879ffcc6c69b567816d5c354afe42b
tree       5a217fdfb0b5e5a327bdcd624506337c1ae1fe32   /kernel/sched_fair.c
parent     7d754596756240fa918b94cd0c3011c77a638987
parent     02f8c6aee8df3cdc935e9bdd4f2d020306035dbe
Merge 'Linux v3.0' into Litmus
Some notes:

* Litmus^RT scheduling class is the topmost scheduling class (above stop_sched_class).
* scheduler_ipi() function (e.g., in smp_reschedule_interrupt()) may increase IPI latencies.
* Added path into schedule() to quickly re-evaluate scheduling decision without becoming preemptive again. This used to be a standard path before the removal of BKL.

Conflicts:
	Makefile
	arch/arm/kernel/calls.S
	arch/arm/kernel/smp.c
	arch/x86/include/asm/unistd_32.h
	arch/x86/kernel/smp.c
	arch/x86/kernel/syscall_table_32.S
	include/linux/hrtimer.h
	kernel/printk.c
	kernel/sched.c
	kernel/sched_fair.c
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--   kernel/sched_fair.c   1084
1 file changed, 761 insertions(+), 323 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e0e8d5ca3c98..334eb474af93 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,10 +22,11 @@
22 22
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h>
25 26
26/* 27/*
27 * Targeted preemption latency for CPU-bound tasks: 28 * Targeted preemption latency for CPU-bound tasks:
28 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) 29 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
29 * 30 *
30 * NOTE: this latency value is not the same as the concept of 31 * NOTE: this latency value is not the same as the concept of
31 * 'timeslice length' - timeslices in CFS are of variable length 32 * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +53,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 53
53/* 54/*
54 * Minimal preemption granularity for CPU-bound tasks: 55 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) 56 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 57 */
57unsigned int sysctl_sched_min_granularity = 750000ULL; 58unsigned int sysctl_sched_min_granularity = 750000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; 59unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
@@ -69,14 +70,6 @@ static unsigned int sched_nr_latency = 8;
69unsigned int sysctl_sched_child_runs_first __read_mostly; 70unsigned int sysctl_sched_child_runs_first __read_mostly;
70 71
71/* 72/*
72 * sys_sched_yield() compat mode
73 *
74 * This option switches the agressive yield implementation of the
75 * old scheduler back on.
76 */
77unsigned int __read_mostly sysctl_sched_compat_yield;
78
79/*
80 * SCHED_OTHER wake-up granularity. 73 * SCHED_OTHER wake-up granularity.
81 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 74 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
82 * 75 *
@@ -89,6 +82,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 82
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 83const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 84
85/*
86 * The exponential sliding window over which load is averaged for shares
87 * distribution.
88 * (default: 10msec)
89 */
90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
91
92static const struct sched_class fair_sched_class; 92static const struct sched_class fair_sched_class;
93 93
94/************************************************************** 94/**************************************************************
@@ -143,6 +143,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
143 return cfs_rq->tg->cfs_rq[this_cpu]; 143 return cfs_rq->tg->cfs_rq[this_cpu];
144} 144}
145 145
146static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
147{
148 if (!cfs_rq->on_list) {
149 /*
150 * Ensure we either appear before our parent (if already
151 * enqueued) or force our parent to appear after us when it is
152 * enqueued. The fact that we always enqueue bottom-up
153 * reduces this to two cases.
154 */
155 if (cfs_rq->tg->parent &&
156 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
157 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
158 &rq_of(cfs_rq)->leaf_cfs_rq_list);
159 } else {
160 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
161 &rq_of(cfs_rq)->leaf_cfs_rq_list);
162 }
163
164 cfs_rq->on_list = 1;
165 }
166}
167
168static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
169{
170 if (cfs_rq->on_list) {
171 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
172 cfs_rq->on_list = 0;
173 }
174}
175
146/* Iterate thr' all leaf cfs_rq's on a runqueue */ 176/* Iterate thr' all leaf cfs_rq's on a runqueue */
147#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 177#define for_each_leaf_cfs_rq(rq, cfs_rq) \
148 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 178 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +276,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
246 return &cpu_rq(this_cpu)->cfs; 276 return &cpu_rq(this_cpu)->cfs;
247} 277}
248 278
279static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
280{
281}
282
283static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
284{
285}
286
249#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 287#define for_each_leaf_cfs_rq(rq, cfs_rq) \
250 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 288 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
251 289
@@ -320,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
320 } 358 }
321 359
322 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 360 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
361#ifndef CONFIG_64BIT
362 smp_wmb();
363 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
364#endif
323} 365}
324 366
325/* 367/*
@@ -374,7 +416,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
374 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 416 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
375} 417}
376 418
377static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 419static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
378{ 420{
379 struct rb_node *left = cfs_rq->rb_leftmost; 421 struct rb_node *left = cfs_rq->rb_leftmost;
380 422
@@ -384,6 +426,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
384 return rb_entry(left, struct sched_entity, run_node); 426 return rb_entry(left, struct sched_entity, run_node);
385} 427}
386 428
429static struct sched_entity *__pick_next_entity(struct sched_entity *se)
430{
431 struct rb_node *next = rb_next(&se->run_node);
432
433 if (!next)
434 return NULL;
435
436 return rb_entry(next, struct sched_entity, run_node);
437}
438
439#ifdef CONFIG_SCHED_DEBUG
387static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 440static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
388{ 441{
389 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 442 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -398,7 +451,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
398 * Scheduling class statistics methods: 451 * Scheduling class statistics methods:
399 */ 452 */
400 453
401#ifdef CONFIG_SCHED_DEBUG
402int sched_proc_update_handler(struct ctl_table *table, int write, 454int sched_proc_update_handler(struct ctl_table *table, int write,
403 void __user *buffer, size_t *lenp, 455 void __user *buffer, size_t *lenp,
404 loff_t *ppos) 456 loff_t *ppos)
@@ -417,7 +469,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
417 WRT_SYSCTL(sched_min_granularity); 469 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency); 470 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity); 471 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL 472#undef WRT_SYSCTL
422 473
423 return 0; 474 return 0;
@@ -495,6 +546,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
495 return calc_delta_fair(sched_slice(cfs_rq, se), se); 546 return calc_delta_fair(sched_slice(cfs_rq, se), se);
496} 547}
497 548
549static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
550static void update_cfs_shares(struct cfs_rq *cfs_rq);
551
498/* 552/*
499 * Update the current task's runtime statistics. Skip current tasks that 553 * Update the current task's runtime statistics. Skip current tasks that
500 * are not in our scheduling class. 554 * are not in our scheduling class.
@@ -514,12 +568,16 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
514 568
515 curr->vruntime += delta_exec_weighted; 569 curr->vruntime += delta_exec_weighted;
516 update_min_vruntime(cfs_rq); 570 update_min_vruntime(cfs_rq);
571
572#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
573 cfs_rq->load_unacc_exec_time += delta_exec;
574#endif
517} 575}
518 576
519static void update_curr(struct cfs_rq *cfs_rq) 577static void update_curr(struct cfs_rq *cfs_rq)
520{ 578{
521 struct sched_entity *curr = cfs_rq->curr; 579 struct sched_entity *curr = cfs_rq->curr;
522 u64 now = rq_of(cfs_rq)->clock; 580 u64 now = rq_of(cfs_rq)->clock_task;
523 unsigned long delta_exec; 581 unsigned long delta_exec;
524 582
525 if (unlikely(!curr)) 583 if (unlikely(!curr))
@@ -602,7 +660,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
602 /* 660 /*
603 * We are starting a new run period: 661 * We are starting a new run period:
604 */ 662 */
605 se->exec_start = rq_of(cfs_rq)->clock; 663 se->exec_start = rq_of(cfs_rq)->clock_task;
606} 664}
607 665
608/************************************************** 666/**************************************************
@@ -633,7 +691,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 list_add(&se->group_node, &cfs_rq->tasks); 691 list_add(&se->group_node, &cfs_rq->tasks);
634 } 692 }
635 cfs_rq->nr_running++; 693 cfs_rq->nr_running++;
636 se->on_rq = 1;
637} 694}
638 695
639static void 696static void
@@ -647,9 +704,164 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
647 list_del_init(&se->group_node); 704 list_del_init(&se->group_node);
648 } 705 }
649 cfs_rq->nr_running--; 706 cfs_rq->nr_running--;
650 se->on_rq = 0;
651} 707}
652 708
709#ifdef CONFIG_FAIR_GROUP_SCHED
710# ifdef CONFIG_SMP
711static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
712 int global_update)
713{
714 struct task_group *tg = cfs_rq->tg;
715 long load_avg;
716
717 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
718 load_avg -= cfs_rq->load_contribution;
719
720 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
721 atomic_add(load_avg, &tg->load_weight);
722 cfs_rq->load_contribution += load_avg;
723 }
724}
725
726static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
727{
728 u64 period = sysctl_sched_shares_window;
729 u64 now, delta;
730 unsigned long load = cfs_rq->load.weight;
731
732 if (cfs_rq->tg == &root_task_group)
733 return;
734
735 now = rq_of(cfs_rq)->clock_task;
736 delta = now - cfs_rq->load_stamp;
737
738 /* truncate load history at 4 idle periods */
739 if (cfs_rq->load_stamp > cfs_rq->load_last &&
740 now - cfs_rq->load_last > 4 * period) {
741 cfs_rq->load_period = 0;
742 cfs_rq->load_avg = 0;
743 delta = period - 1;
744 }
745
746 cfs_rq->load_stamp = now;
747 cfs_rq->load_unacc_exec_time = 0;
748 cfs_rq->load_period += delta;
749 if (load) {
750 cfs_rq->load_last = now;
751 cfs_rq->load_avg += delta * load;
752 }
753
754 /* consider updating load contribution on each fold or truncate */
755 if (global_update || cfs_rq->load_period > period
756 || !cfs_rq->load_period)
757 update_cfs_rq_load_contribution(cfs_rq, global_update);
758
759 while (cfs_rq->load_period > period) {
760 /*
761 * Inline assembly required to prevent the compiler
762 * optimising this loop into a divmod call.
763 * See __iter_div_u64_rem() for another example of this.
764 */
765 asm("" : "+rm" (cfs_rq->load_period));
766 cfs_rq->load_period /= 2;
767 cfs_rq->load_avg /= 2;
768 }
769
770 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
771 list_del_leaf_cfs_rq(cfs_rq);
772}
773
774static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
775{
776 long load_weight, load, shares;
777
778 load = cfs_rq->load.weight;
779
780 load_weight = atomic_read(&tg->load_weight);
781 load_weight += load;
782 load_weight -= cfs_rq->load_contribution;
783
784 shares = (tg->shares * load);
785 if (load_weight)
786 shares /= load_weight;
787
788 if (shares < MIN_SHARES)
789 shares = MIN_SHARES;
790 if (shares > tg->shares)
791 shares = tg->shares;
792
793 return shares;
794}
795
796static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
797{
798 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
799 update_cfs_load(cfs_rq, 0);
800 update_cfs_shares(cfs_rq);
801 }
802}
803# else /* CONFIG_SMP */
804static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
805{
806}
807
808static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
809{
810 return tg->shares;
811}
812
813static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
814{
815}
816# endif /* CONFIG_SMP */
817static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
818 unsigned long weight)
819{
820 if (se->on_rq) {
821 /* commit outstanding execution time */
822 if (cfs_rq->curr == se)
823 update_curr(cfs_rq);
824 account_entity_dequeue(cfs_rq, se);
825 }
826
827 update_load_set(&se->load, weight);
828
829 if (se->on_rq)
830 account_entity_enqueue(cfs_rq, se);
831}
832
833static void update_cfs_shares(struct cfs_rq *cfs_rq)
834{
835 struct task_group *tg;
836 struct sched_entity *se;
837 long shares;
838
839 tg = cfs_rq->tg;
840 se = tg->se[cpu_of(rq_of(cfs_rq))];
841 if (!se)
842 return;
843#ifndef CONFIG_SMP
844 if (likely(se->load.weight == tg->shares))
845 return;
846#endif
847 shares = calc_cfs_shares(cfs_rq, tg);
848
849 reweight_entity(cfs_rq_of(se), se, shares);
850}
851#else /* CONFIG_FAIR_GROUP_SCHED */
852static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
853{
854}
855
856static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
857{
858}
859
860static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
861{
862}
863#endif /* CONFIG_FAIR_GROUP_SCHED */
864
653static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 865static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
654{ 866{
655#ifdef CONFIG_SCHEDSTATS 867#ifdef CONFIG_SCHEDSTATS
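
As a rough standalone illustration of the per-CPU share calculation introduced above (calc_cfs_shares() scales tg->shares by this CPU's fraction of the group's total load and clamps the result to [MIN_SHARES, tg->shares]), the following plain C sketch uses made-up names and numbers rather than the kernel's types:

    /* Illustration only: a group entity's weight on one CPU is the group's
     * total shares scaled by that CPU's fraction of the group load,
     * clamped to [MIN_SHARES, shares].  Names and numbers are made up. */
    #include <stdio.h>

    #define MIN_SHARES 2UL

    static unsigned long calc_shares(unsigned long tg_shares,
                                     unsigned long cpu_load,
                                     unsigned long total_load)
    {
            unsigned long s = tg_shares * cpu_load;

            if (total_load)
                    s /= total_load;
            if (s < MIN_SHARES)
                    s = MIN_SHARES;
            if (s > tg_shares)
                    s = tg_shares;
            return s;
    }

    int main(void)
    {
            /* 1024-share group holding 1/4 of its load on this CPU */
            printf("%lu\n", calc_shares(1024, 512, 2048));   /* -> 256 */
            return 0;
    }

With the numbers above, a group that keeps a quarter of its load on this CPU ends up with a quarter of its shares here.
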
@@ -771,7 +983,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
771 * Update run-time statistics of the 'current'. 983 * Update run-time statistics of the 'current'.
772 */ 984 */
773 update_curr(cfs_rq); 985 update_curr(cfs_rq);
986 update_cfs_load(cfs_rq, 0);
774 account_entity_enqueue(cfs_rq, se); 987 account_entity_enqueue(cfs_rq, se);
988 update_cfs_shares(cfs_rq);
775 989
776 if (flags & ENQUEUE_WAKEUP) { 990 if (flags & ENQUEUE_WAKEUP) {
777 place_entity(cfs_rq, se, 0); 991 place_entity(cfs_rq, se, 0);
@@ -782,21 +996,55 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
782 check_spread(cfs_rq, se); 996 check_spread(cfs_rq, se);
783 if (se != cfs_rq->curr) 997 if (se != cfs_rq->curr)
784 __enqueue_entity(cfs_rq, se); 998 __enqueue_entity(cfs_rq, se);
999 se->on_rq = 1;
1000
1001 if (cfs_rq->nr_running == 1)
1002 list_add_leaf_cfs_rq(cfs_rq);
1003}
1004
1005static void __clear_buddies_last(struct sched_entity *se)
1006{
1007 for_each_sched_entity(se) {
1008 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1009 if (cfs_rq->last == se)
1010 cfs_rq->last = NULL;
1011 else
1012 break;
1013 }
785} 1014}
786 1015
787static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1016static void __clear_buddies_next(struct sched_entity *se)
788{ 1017{
789 if (!se || cfs_rq->last == se) 1018 for_each_sched_entity(se) {
790 cfs_rq->last = NULL; 1019 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1020 if (cfs_rq->next == se)
1021 cfs_rq->next = NULL;
1022 else
1023 break;
1024 }
1025}
791 1026
792 if (!se || cfs_rq->next == se) 1027static void __clear_buddies_skip(struct sched_entity *se)
793 cfs_rq->next = NULL; 1028{
1029 for_each_sched_entity(se) {
1030 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1031 if (cfs_rq->skip == se)
1032 cfs_rq->skip = NULL;
1033 else
1034 break;
1035 }
794} 1036}
795 1037
796static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1038static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
797{ 1039{
798 for_each_sched_entity(se) 1040 if (cfs_rq->last == se)
799 __clear_buddies(cfs_rq_of(se), se); 1041 __clear_buddies_last(se);
1042
1043 if (cfs_rq->next == se)
1044 __clear_buddies_next(se);
1045
1046 if (cfs_rq->skip == se)
1047 __clear_buddies_skip(se);
800} 1048}
801 1049
802static void 1050static void
@@ -825,8 +1073,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
825 1073
826 if (se != cfs_rq->curr) 1074 if (se != cfs_rq->curr)
827 __dequeue_entity(cfs_rq, se); 1075 __dequeue_entity(cfs_rq, se);
1076 se->on_rq = 0;
1077 update_cfs_load(cfs_rq, 0);
828 account_entity_dequeue(cfs_rq, se); 1078 account_entity_dequeue(cfs_rq, se);
829 update_min_vruntime(cfs_rq);
830 1079
831 /* 1080 /*
832 * Normalize the entity after updating the min_vruntime because the 1081 * Normalize the entity after updating the min_vruntime because the
@@ -835,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
835 */ 1084 */
836 if (!(flags & DEQUEUE_SLEEP)) 1085 if (!(flags & DEQUEUE_SLEEP))
837 se->vruntime -= cfs_rq->min_vruntime; 1086 se->vruntime -= cfs_rq->min_vruntime;
1087
1088 update_min_vruntime(cfs_rq);
1089 update_cfs_shares(cfs_rq);
838} 1090}
839 1091
840/* 1092/*
@@ -869,9 +1121,12 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
869 return; 1121 return;
870 1122
871 if (cfs_rq->nr_running > 1) { 1123 if (cfs_rq->nr_running > 1) {
872 struct sched_entity *se = __pick_next_entity(cfs_rq); 1124 struct sched_entity *se = __pick_first_entity(cfs_rq);
873 s64 delta = curr->vruntime - se->vruntime; 1125 s64 delta = curr->vruntime - se->vruntime;
874 1126
1127 if (delta < 0)
1128 return;
1129
875 if (delta > ideal_runtime) 1130 if (delta > ideal_runtime)
876 resched_task(rq_of(cfs_rq)->curr); 1131 resched_task(rq_of(cfs_rq)->curr);
877 } 1132 }
@@ -910,13 +1165,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
910static int 1165static int
911wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); 1166wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
912 1167
1168/*
1169 * Pick the next process, keeping these things in mind, in this order:
1170 * 1) keep things fair between processes/task groups
1171 * 2) pick the "next" process, since someone really wants that to run
1172 * 3) pick the "last" process, for cache locality
1173 * 4) do not run the "skip" process, if something else is available
1174 */
913static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 1175static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
914{ 1176{
915 struct sched_entity *se = __pick_next_entity(cfs_rq); 1177 struct sched_entity *se = __pick_first_entity(cfs_rq);
916 struct sched_entity *left = se; 1178 struct sched_entity *left = se;
917 1179
918 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) 1180 /*
919 se = cfs_rq->next; 1181 * Avoid running the skip buddy, if running something else can
1182 * be done without getting too unfair.
1183 */
1184 if (cfs_rq->skip == se) {
1185 struct sched_entity *second = __pick_next_entity(se);
1186 if (second && wakeup_preempt_entity(second, left) < 1)
1187 se = second;
1188 }
920 1189
921 /* 1190 /*
922 * Prefer last buddy, try to return the CPU to a preempted task. 1191 * Prefer last buddy, try to return the CPU to a preempted task.
@@ -924,6 +1193,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
924 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) 1193 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
925 se = cfs_rq->last; 1194 se = cfs_rq->last;
926 1195
1196 /*
1197 * Someone really wants this to run. If it's not unfair, run it.
1198 */
1199 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1200 se = cfs_rq->next;
1201
927 clear_buddies(cfs_rq, se); 1202 clear_buddies(cfs_rq, se);
928 1203
929 return se; 1204 return se;
@@ -955,6 +1230,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
955 */ 1230 */
956 update_curr(cfs_rq); 1231 update_curr(cfs_rq);
957 1232
1233 /*
1234 * Update share accounting for long-running entities.
1235 */
1236 update_entity_shares_tick(cfs_rq);
1237
958#ifdef CONFIG_SCHED_HRTICK 1238#ifdef CONFIG_SCHED_HRTICK
959 /* 1239 /*
960 * queued ticks are scheduled to match the slice, so don't bother 1240 * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,9 +1335,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1055 flags = ENQUEUE_WAKEUP; 1335 flags = ENQUEUE_WAKEUP;
1056 } 1336 }
1057 1337
1338 for_each_sched_entity(se) {
1339 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1340
1341 update_cfs_load(cfs_rq, 0);
1342 update_cfs_shares(cfs_rq);
1343 }
1344
1058 hrtick_update(rq); 1345 hrtick_update(rq);
1059} 1346}
1060 1347
1348static void set_next_buddy(struct sched_entity *se);
1349
1061/* 1350/*
1062 * The dequeue_task method is called before nr_running is 1351 * The dequeue_task method is called before nr_running is
1063 * decreased. We remove the task from the rbtree and 1352 * decreased. We remove the task from the rbtree and
@@ -1067,73 +1356,56 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1067{ 1356{
1068 struct cfs_rq *cfs_rq; 1357 struct cfs_rq *cfs_rq;
1069 struct sched_entity *se = &p->se; 1358 struct sched_entity *se = &p->se;
1359 int task_sleep = flags & DEQUEUE_SLEEP;
1070 1360
1071 for_each_sched_entity(se) { 1361 for_each_sched_entity(se) {
1072 cfs_rq = cfs_rq_of(se); 1362 cfs_rq = cfs_rq_of(se);
1073 dequeue_entity(cfs_rq, se, flags); 1363 dequeue_entity(cfs_rq, se, flags);
1364
1074 /* Don't dequeue parent if it has other entities besides us */ 1365 /* Don't dequeue parent if it has other entities besides us */
1075 if (cfs_rq->load.weight) 1366 if (cfs_rq->load.weight) {
1367 /*
1368 * Bias pick_next to pick a task from this cfs_rq, as
1369 * p is sleeping when it is within its sched_slice.
1370 */
1371 if (task_sleep && parent_entity(se))
1372 set_next_buddy(parent_entity(se));
1076 break; 1373 break;
1374 }
1077 flags |= DEQUEUE_SLEEP; 1375 flags |= DEQUEUE_SLEEP;
1078 } 1376 }
1079 1377
1080 hrtick_update(rq); 1378 for_each_sched_entity(se) {
1081} 1379 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1082
1083/*
1084 * sched_yield() support is very simple - we dequeue and enqueue.
1085 *
1086 * If compat_yield is turned on then we requeue to the end of the tree.
1087 */
1088static void yield_task_fair(struct rq *rq)
1089{
1090 struct task_struct *curr = rq->curr;
1091 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1092 struct sched_entity *rightmost, *se = &curr->se;
1093
1094 /*
1095 * Are we the only task in the tree?
1096 */
1097 if (unlikely(cfs_rq->nr_running == 1))
1098 return;
1099
1100 clear_buddies(cfs_rq, se);
1101
1102 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
1103 update_rq_clock(rq);
1104 /*
1105 * Update run-time statistics of the 'current'.
1106 */
1107 update_curr(cfs_rq);
1108 1380
1109 return; 1381 update_cfs_load(cfs_rq, 0);
1382 update_cfs_shares(cfs_rq);
1110 } 1383 }
1111 /*
1112 * Find the rightmost entry in the rbtree:
1113 */
1114 rightmost = __pick_last_entity(cfs_rq);
1115 /*
1116 * Already in the rightmost position?
1117 */
1118 if (unlikely(!rightmost || entity_before(rightmost, se)))
1119 return;
1120 1384
1121 /* 1385 hrtick_update(rq);
1122 * Minimally necessary key value to be last in the tree:
1123 * Upon rescheduling, sched_class::put_prev_task() will place
1124 * 'current' within the tree based on its new key value.
1125 */
1126 se->vruntime = rightmost->vruntime + 1;
1127} 1386}
1128 1387
1129#ifdef CONFIG_SMP 1388#ifdef CONFIG_SMP
1130 1389
1131static void task_waking_fair(struct rq *rq, struct task_struct *p) 1390static void task_waking_fair(struct task_struct *p)
1132{ 1391{
1133 struct sched_entity *se = &p->se; 1392 struct sched_entity *se = &p->se;
1134 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1393 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1394 u64 min_vruntime;
1135 1395
1136 se->vruntime -= cfs_rq->min_vruntime; 1396#ifndef CONFIG_64BIT
1397 u64 min_vruntime_copy;
1398
1399 do {
1400 min_vruntime_copy = cfs_rq->min_vruntime_copy;
1401 smp_rmb();
1402 min_vruntime = cfs_rq->min_vruntime;
1403 } while (min_vruntime != min_vruntime_copy);
1404#else
1405 min_vruntime = cfs_rq->min_vruntime;
1406#endif
1407
1408 se->vruntime -= min_vruntime;
1137} 1409}
1138 1410
1139#ifdef CONFIG_FAIR_GROUP_SCHED 1411#ifdef CONFIG_FAIR_GROUP_SCHED
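
The #ifndef CONFIG_64BIT reader loop above pairs with the smp_wmb()/min_vruntime_copy writer added earlier in this diff: on 32-bit a 64-bit load can tear, so the reader retries until the value and its copy agree. A minimal C11 sketch of that protocol (illustrative names; the kernel uses smp_wmb()/smp_rmb() rather than C11 fences):

    /* Schematic only: writer publishes the 64-bit value through a second
     * copy with a write barrier in between; the reader retries until both
     * reads agree, so it never acts on a torn value. */
    #include <stdio.h>
    #include <stdint.h>
    #include <stdatomic.h>

    static volatile uint64_t min_vruntime;
    static volatile uint64_t min_vruntime_copy;

    static void publish(uint64_t v)
    {
            min_vruntime = v;
            atomic_thread_fence(memory_order_release);   /* ~ smp_wmb() */
            min_vruntime_copy = v;
    }

    static uint64_t read_consistent(void)
    {
            uint64_t v, copy;

            do {
                    copy = min_vruntime_copy;
                    atomic_thread_fence(memory_order_acquire);  /* ~ smp_rmb() */
                    v = min_vruntime;
            } while (v != copy);

            return v;
    }

    int main(void)
    {
            publish(123456789ULL);
            printf("%llu\n", (unsigned long long)read_consistent());
            return 0;
    }
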
@@ -1143,67 +1415,36 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
1143 * Adding load to a group doesn't make a group heavier, but can cause movement 1415 * Adding load to a group doesn't make a group heavier, but can cause movement
1144 * of group shares between cpus. Assuming the shares were perfectly aligned one 1416 * of group shares between cpus. Assuming the shares were perfectly aligned one
1145 * can calculate the shift in shares. 1417 * can calculate the shift in shares.
1146 *
1147 * The problem is that perfectly aligning the shares is rather expensive, hence
1148 * we try to avoid doing that too often - see update_shares(), which ratelimits
1149 * this change.
1150 *
1151 * We compensate this by not only taking the current delta into account, but
1152 * also considering the delta between when the shares were last adjusted and
1153 * now.
1154 *
1155 * We still saw a performance dip, some tracing learned us that between
1156 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
1157 * significantly. Therefore try to bias the error in direction of failing
1158 * the affine wakeup.
1159 *
1160 */ 1418 */
1161static long effective_load(struct task_group *tg, int cpu, 1419static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1162 long wl, long wg)
1163{ 1420{
1164 struct sched_entity *se = tg->se[cpu]; 1421 struct sched_entity *se = tg->se[cpu];
1165 1422
1166 if (!tg->parent) 1423 if (!tg->parent)
1167 return wl; 1424 return wl;
1168 1425
1169 /*
1170 * By not taking the decrease of shares on the other cpu into
1171 * account our error leans towards reducing the affine wakeups.
1172 */
1173 if (!wl && sched_feat(ASYM_EFF_LOAD))
1174 return wl;
1175
1176 for_each_sched_entity(se) { 1426 for_each_sched_entity(se) {
1177 long S, rw, s, a, b; 1427 long lw, w;
1178 long more_w;
1179 1428
1180 /* 1429 tg = se->my_q->tg;
1181 * Instead of using this increment, also add the difference 1430 w = se->my_q->load.weight;
1182 * between when the shares were last updated and now.
1183 */
1184 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1185 wl += more_w;
1186 wg += more_w;
1187 1431
1188 S = se->my_q->tg->shares; 1432 /* use this cpu's instantaneous contribution */
1189 s = se->my_q->shares; 1433 lw = atomic_read(&tg->load_weight);
1190 rw = se->my_q->rq_weight; 1434 lw -= se->my_q->load_contribution;
1435 lw += w + wg;
1191 1436
1192 a = S*(rw + wl); 1437 wl += w;
1193 b = S*rw + s*wg;
1194 1438
1195 wl = s*(a-b); 1439 if (lw > 0 && wl < lw)
1196 1440 wl = (wl * tg->shares) / lw;
1197 if (likely(b)) 1441 else
1198 wl /= b; 1442 wl = tg->shares;
1199 1443
1200 /* 1444 /* zero point is MIN_SHARES */
1201 * Assume the group is already running and will 1445 if (wl < MIN_SHARES)
1202 * thus already be accounted for in the weight. 1446 wl = MIN_SHARES;
1203 * 1447 wl -= se->load.weight;
1204 * That is, moving shares between CPUs, does not
1205 * alter the group weight.
1206 */
1207 wg = 0; 1448 wg = 0;
1208 } 1449 }
1209 1450
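
The rewritten effective_load() above estimates, per hierarchy level, how the group entity's weight on this CPU shifts when load wl is added here, using the group's instantaneous load instead of the old ratelimited shares. A rough single-level illustration with made-up numbers (plain C, simplified names, ignoring the kernel's fixed-point details):

    /* Rough illustration: given the group's shares, this CPU's current
     * queue weight, the group's load elsewhere, and the extra load wl
     * added here, the entity's new weight is shares scaled by this CPU's
     * share of the new total load; return the delta vs. its old weight. */
    #include <stdio.h>

    #define MIN_SHARES 2L

    static long eff_load_step(long shares, long cpu_w, long other_w,
                              long cur_entity_w, long wl)
    {
            long lw = other_w + cpu_w + wl;     /* new total group load */
            long new_w = cpu_w + wl;            /* new load on this CPU */

            if (lw > 0 && new_w < lw)
                    new_w = new_w * shares / lw;
            else
                    new_w = shares;
            if (new_w < MIN_SHARES)
                    new_w = MIN_SHARES;

            return new_w - cur_entity_w;        /* weight delta at this level */
    }

    int main(void)
    {
            /* 1024-share group, 1024 load here, 1024 elsewhere, entity
             * currently weighted 512; waking a weight-1024 task here. */
            printf("%ld\n", eff_load_step(1024, 1024, 1024, 512, 1024));  /* 170 */
            return 0;
    }
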
@@ -1222,7 +1463,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1222 1463
1223static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1464static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1224{ 1465{
1225 unsigned long this_load, load; 1466 s64 this_load, load;
1226 int idx, this_cpu, prev_cpu; 1467 int idx, this_cpu, prev_cpu;
1227 unsigned long tl_per_task; 1468 unsigned long tl_per_task;
1228 struct task_group *tg; 1469 struct task_group *tg;
@@ -1261,8 +1502,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1261 * Otherwise check if either cpus are near enough in load to allow this 1502 * Otherwise check if either cpus are near enough in load to allow this
1262 * task to be woken on this_cpu. 1503 * task to be woken on this_cpu.
1263 */ 1504 */
1264 if (this_load) { 1505 if (this_load > 0) {
1265 unsigned long this_eff_load, prev_eff_load; 1506 s64 this_eff_load, prev_eff_load;
1266 1507
1267 this_eff_load = 100; 1508 this_eff_load = 100;
1268 this_eff_load *= power_of(prev_cpu); 1509 this_eff_load *= power_of(prev_cpu);
@@ -1344,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1344 } 1585 }
1345 1586
1346 /* Adjust by relative CPU power of the group */ 1587 /* Adjust by relative CPU power of the group */
1347 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1588 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
1348 1589
1349 if (local_group) { 1590 if (local_group) {
1350 this_load = avg_load; 1591 this_load = avg_load;
@@ -1409,6 +1650,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1409 /* 1650 /*
1410 * Otherwise, iterate the domains and find an elegible idle cpu. 1651 * Otherwise, iterate the domains and find an elegible idle cpu.
1411 */ 1652 */
1653 rcu_read_lock();
1412 for_each_domain(target, sd) { 1654 for_each_domain(target, sd) {
1413 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 1655 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1414 break; 1656 break;
@@ -1428,6 +1670,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1428 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 1670 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1429 break; 1671 break;
1430 } 1672 }
1673 rcu_read_unlock();
1431 1674
1432 return target; 1675 return target;
1433} 1676}
@@ -1444,7 +1687,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1444 * preempt must be disabled. 1687 * preempt must be disabled.
1445 */ 1688 */
1446static int 1689static int
1447select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) 1690select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1448{ 1691{
1449 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1692 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1450 int cpu = smp_processor_id(); 1693 int cpu = smp_processor_id();
@@ -1460,6 +1703,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1460 new_cpu = prev_cpu; 1703 new_cpu = prev_cpu;
1461 } 1704 }
1462 1705
1706 rcu_read_lock();
1463 for_each_domain(cpu, tmp) { 1707 for_each_domain(cpu, tmp) {
1464 if (!(tmp->flags & SD_LOAD_BALANCE)) 1708 if (!(tmp->flags & SD_LOAD_BALANCE))
1465 continue; 1709 continue;
@@ -1479,7 +1723,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1479 nr_running += cpu_rq(i)->cfs.nr_running; 1723 nr_running += cpu_rq(i)->cfs.nr_running;
1480 } 1724 }
1481 1725
1482 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 1726 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
1483 1727
1484 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1728 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1485 nr_running /= 2; 1729 nr_running /= 2;
@@ -1508,28 +1752,12 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1508 sd = tmp; 1752 sd = tmp;
1509 } 1753 }
1510 1754
1511#ifdef CONFIG_FAIR_GROUP_SCHED
1512 if (sched_feat(LB_SHARES_UPDATE)) {
1513 /*
1514 * Pick the largest domain to update shares over
1515 */
1516 tmp = sd;
1517 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1518 tmp = affine_sd;
1519
1520 if (tmp) {
1521 raw_spin_unlock(&rq->lock);
1522 update_shares(tmp);
1523 raw_spin_lock(&rq->lock);
1524 }
1525 }
1526#endif
1527
1528 if (affine_sd) { 1755 if (affine_sd) {
1529 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1756 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1530 return select_idle_sibling(p, cpu); 1757 prev_cpu = cpu;
1531 else 1758
1532 return select_idle_sibling(p, prev_cpu); 1759 new_cpu = select_idle_sibling(p, prev_cpu);
1760 goto unlock;
1533 } 1761 }
1534 1762
1535 while (sd) { 1763 while (sd) {
@@ -1570,6 +1798,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1570 } 1798 }
1571 /* while loop will break here if sd == NULL */ 1799 /* while loop will break here if sd == NULL */
1572 } 1800 }
1801unlock:
1802 rcu_read_unlock();
1573 1803
1574 return new_cpu; 1804 return new_cpu;
1575} 1805}
@@ -1593,10 +1823,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1593 * This is especially important for buddies when the leftmost 1823 * This is especially important for buddies when the leftmost
1594 * task is higher priority than the buddy. 1824 * task is higher priority than the buddy.
1595 */ 1825 */
1596 if (unlikely(se->load.weight != NICE_0_LOAD)) 1826 return calc_delta_fair(gran, se);
1597 gran = calc_delta_fair(gran, se);
1598
1599 return gran;
1600} 1827}
1601 1828
1602/* 1829/*
@@ -1630,18 +1857,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1630 1857
1631static void set_last_buddy(struct sched_entity *se) 1858static void set_last_buddy(struct sched_entity *se)
1632{ 1859{
1633 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1860 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1634 for_each_sched_entity(se) 1861 return;
1635 cfs_rq_of(se)->last = se; 1862
1636 } 1863 for_each_sched_entity(se)
1864 cfs_rq_of(se)->last = se;
1637} 1865}
1638 1866
1639static void set_next_buddy(struct sched_entity *se) 1867static void set_next_buddy(struct sched_entity *se)
1640{ 1868{
1641 if (likely(task_of(se)->policy != SCHED_IDLE)) { 1869 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
1642 for_each_sched_entity(se) 1870 return;
1643 cfs_rq_of(se)->next = se; 1871
1644 } 1872 for_each_sched_entity(se)
1873 cfs_rq_of(se)->next = se;
1874}
1875
1876static void set_skip_buddy(struct sched_entity *se)
1877{
1878 for_each_sched_entity(se)
1879 cfs_rq_of(se)->skip = se;
1645} 1880}
1646 1881
1647/* 1882/*
@@ -1653,18 +1888,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1653 struct sched_entity *se = &curr->se, *pse = &p->se; 1888 struct sched_entity *se = &curr->se, *pse = &p->se;
1654 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1889 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1655 int scale = cfs_rq->nr_running >= sched_nr_latency; 1890 int scale = cfs_rq->nr_running >= sched_nr_latency;
1891 int next_buddy_marked = 0;
1656 1892
1657 if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS) 1893 if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS)
1658 goto preempt; 1894 goto preempt;
1659 1895
1660 if (unlikely(p->sched_class != &fair_sched_class))
1661 return;
1662
1663 if (unlikely(se == pse)) 1896 if (unlikely(se == pse))
1664 return; 1897 return;
1665 1898
1666 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) 1899 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1667 set_next_buddy(pse); 1900 set_next_buddy(pse);
1901 next_buddy_marked = 1;
1902 }
1668 1903
1669 /* 1904 /*
1670 * We can come here with TIF_NEED_RESCHED already set from new task 1905 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1673,16 +1908,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1673 if (test_tsk_need_resched(curr)) 1908 if (test_tsk_need_resched(curr))
1674 return; 1909 return;
1675 1910
1911 /* Idle tasks are by definition preempted by non-idle tasks. */
1912 if (unlikely(curr->policy == SCHED_IDLE) &&
1913 likely(p->policy != SCHED_IDLE))
1914 goto preempt;
1915
1676 /* 1916 /*
1677 * Batch and idle tasks do not preempt (their preemption is driven by 1917 * Batch and idle tasks do not preempt non-idle tasks (their preemption
1678 * the tick): 1918 * is driven by the tick):
1679 */ 1919 */
1680 if (unlikely(p->policy != SCHED_NORMAL)) 1920 if (unlikely(p->policy != SCHED_NORMAL))
1681 return; 1921 return;
1682 1922
1683 /* Idle tasks are by definition preempted by everybody. */
1684 if (unlikely(curr->policy == SCHED_IDLE))
1685 goto preempt;
1686 1923
1687 if (!sched_feat(WAKEUP_PREEMPT)) 1924 if (!sched_feat(WAKEUP_PREEMPT))
1688 return; 1925 return;
@@ -1690,8 +1927,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1690 update_curr(cfs_rq); 1927 update_curr(cfs_rq);
1691 find_matching_se(&se, &pse); 1928 find_matching_se(&se, &pse);
1692 BUG_ON(!pse); 1929 BUG_ON(!pse);
1693 if (wakeup_preempt_entity(se, pse) == 1) 1930 if (wakeup_preempt_entity(se, pse) == 1) {
1931 /*
1932 * Bias pick_next to pick the sched entity that is
1933 * triggering this preemption.
1934 */
1935 if (!next_buddy_marked)
1936 set_next_buddy(pse);
1694 goto preempt; 1937 goto preempt;
1938 }
1695 1939
1696 return; 1940 return;
1697 1941
@@ -1748,6 +1992,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1748 } 1992 }
1749} 1993}
1750 1994
1995/*
1996 * sched_yield() is very simple
1997 *
1998 * The magic of dealing with the ->skip buddy is in pick_next_entity.
1999 */
2000static void yield_task_fair(struct rq *rq)
2001{
2002 struct task_struct *curr = rq->curr;
2003 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
2004 struct sched_entity *se = &curr->se;
2005
2006 /*
2007 * Are we the only task in the tree?
2008 */
2009 if (unlikely(rq->nr_running == 1))
2010 return;
2011
2012 clear_buddies(cfs_rq, se);
2013
2014 if (curr->policy != SCHED_BATCH) {
2015 update_rq_clock(rq);
2016 /*
2017 * Update run-time statistics of the 'current'.
2018 */
2019 update_curr(cfs_rq);
2020 }
2021
2022 set_skip_buddy(se);
2023}
2024
2025static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
2026{
2027 struct sched_entity *se = &p->se;
2028
2029 if (!se->on_rq)
2030 return false;
2031
2032 /* Tell the scheduler that we'd really like pse to run next. */
2033 set_next_buddy(se);
2034
2035 yield_task_fair(rq);
2036
2037 return true;
2038}
2039
1751#ifdef CONFIG_SMP 2040#ifdef CONFIG_SMP
1752/************************************************** 2041/**************************************************
1753 * Fair scheduling class load-balancing methods: 2042 * Fair scheduling class load-balancing methods:
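
The new yield_task_fair() in the hunk above replaces the removed compat-yield path: sched_yield() no longer requeues the task to the far right of the timeline, it only marks the current entity as the skip buddy, which pick_next_entity() then steps past when something comparably fair is runnable. From userspace the call is unchanged, but it is best treated as a hint; a minimal usage sketch:

    /* Userspace view of the change: under CFS, sched_yield() is now a
     * skip-buddy hint rather than a guaranteed back-of-queue placement.
     * Busy-wait loops that relied on yield for fairness should generally
     * prefer blocking primitives. */
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            if (sched_yield() != 0)
                    perror("sched_yield");
            else
                    printf("yielded (skip-buddy hint under CFS)\n");
            return 0;
    }
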
@@ -1798,7 +2087,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1798 * 2) too many balance attempts have failed. 2087 * 2) too many balance attempts have failed.
1799 */ 2088 */
1800 2089
1801 tsk_cache_hot = task_hot(p, rq->clock, sd); 2090 tsk_cache_hot = task_hot(p, rq->clock_task, sd);
1802 if (!tsk_cache_hot || 2091 if (!tsk_cache_hot ||
1803 sd->nr_balance_failed > sd->cache_nice_tries) { 2092 sd->nr_balance_failed > sd->cache_nice_tries) {
1804#ifdef CONFIG_SCHEDSTATS 2093#ifdef CONFIG_SCHEDSTATS
@@ -1857,23 +2146,22 @@ static unsigned long
1857balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2146balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1858 unsigned long max_load_move, struct sched_domain *sd, 2147 unsigned long max_load_move, struct sched_domain *sd,
1859 enum cpu_idle_type idle, int *all_pinned, 2148 enum cpu_idle_type idle, int *all_pinned,
1860 int *this_best_prio, struct cfs_rq *busiest_cfs_rq) 2149 struct cfs_rq *busiest_cfs_rq)
1861{ 2150{
1862 int loops = 0, pulled = 0, pinned = 0; 2151 int loops = 0, pulled = 0;
1863 long rem_load_move = max_load_move; 2152 long rem_load_move = max_load_move;
1864 struct task_struct *p, *n; 2153 struct task_struct *p, *n;
1865 2154
1866 if (max_load_move == 0) 2155 if (max_load_move == 0)
1867 goto out; 2156 goto out;
1868 2157
1869 pinned = 1;
1870
1871 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 2158 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1872 if (loops++ > sysctl_sched_nr_migrate) 2159 if (loops++ > sysctl_sched_nr_migrate)
1873 break; 2160 break;
1874 2161
1875 if ((p->se.load.weight >> 1) > rem_load_move || 2162 if ((p->se.load.weight >> 1) > rem_load_move ||
1876 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) 2163 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2164 all_pinned))
1877 continue; 2165 continue;
1878 2166
1879 pull_task(busiest, p, this_rq, this_cpu); 2167 pull_task(busiest, p, this_rq, this_cpu);
@@ -1896,9 +2184,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1896 */ 2184 */
1897 if (rem_load_move <= 0) 2185 if (rem_load_move <= 0)
1898 break; 2186 break;
1899
1900 if (p->prio < *this_best_prio)
1901 *this_best_prio = p->prio;
1902 } 2187 }
1903out: 2188out:
1904 /* 2189 /*
@@ -1908,18 +2193,57 @@ out:
1908 */ 2193 */
1909 schedstat_add(sd, lb_gained[idle], pulled); 2194 schedstat_add(sd, lb_gained[idle], pulled);
1910 2195
1911 if (all_pinned)
1912 *all_pinned = pinned;
1913
1914 return max_load_move - rem_load_move; 2196 return max_load_move - rem_load_move;
1915} 2197}
1916 2198
1917#ifdef CONFIG_FAIR_GROUP_SCHED 2199#ifdef CONFIG_FAIR_GROUP_SCHED
2200/*
2201 * update tg->load_weight by folding this cpu's load_avg
2202 */
2203static int update_shares_cpu(struct task_group *tg, int cpu)
2204{
2205 struct cfs_rq *cfs_rq;
2206 unsigned long flags;
2207 struct rq *rq;
2208
2209 if (!tg->se[cpu])
2210 return 0;
2211
2212 rq = cpu_rq(cpu);
2213 cfs_rq = tg->cfs_rq[cpu];
2214
2215 raw_spin_lock_irqsave(&rq->lock, flags);
2216
2217 update_rq_clock(rq);
2218 update_cfs_load(cfs_rq, 1);
2219
2220 /*
2221 * We need to update shares after updating tg->load_weight in
2222 * order to adjust the weight of groups with long running tasks.
2223 */
2224 update_cfs_shares(cfs_rq);
2225
2226 raw_spin_unlock_irqrestore(&rq->lock, flags);
2227
2228 return 0;
2229}
2230
2231static void update_shares(int cpu)
2232{
2233 struct cfs_rq *cfs_rq;
2234 struct rq *rq = cpu_rq(cpu);
2235
2236 rcu_read_lock();
2237 for_each_leaf_cfs_rq(rq, cfs_rq)
2238 update_shares_cpu(cfs_rq->tg, cpu);
2239 rcu_read_unlock();
2240}
2241
1918static unsigned long 2242static unsigned long
1919load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2243load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1920 unsigned long max_load_move, 2244 unsigned long max_load_move,
1921 struct sched_domain *sd, enum cpu_idle_type idle, 2245 struct sched_domain *sd, enum cpu_idle_type idle,
1922 int *all_pinned, int *this_best_prio) 2246 int *all_pinned)
1923{ 2247{
1924 long rem_load_move = max_load_move; 2248 long rem_load_move = max_load_move;
1925 int busiest_cpu = cpu_of(busiest); 2249 int busiest_cpu = cpu_of(busiest);
@@ -1944,7 +2268,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1944 rem_load = div_u64(rem_load, busiest_h_load + 1); 2268 rem_load = div_u64(rem_load, busiest_h_load + 1);
1945 2269
1946 moved_load = balance_tasks(this_rq, this_cpu, busiest, 2270 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1947 rem_load, sd, idle, all_pinned, this_best_prio, 2271 rem_load, sd, idle, all_pinned,
1948 busiest_cfs_rq); 2272 busiest_cfs_rq);
1949 2273
1950 if (!moved_load) 2274 if (!moved_load)
@@ -1962,15 +2286,19 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1962 return max_load_move - rem_load_move; 2286 return max_load_move - rem_load_move;
1963} 2287}
1964#else 2288#else
2289static inline void update_shares(int cpu)
2290{
2291}
2292
1965static unsigned long 2293static unsigned long
1966load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2294load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1967 unsigned long max_load_move, 2295 unsigned long max_load_move,
1968 struct sched_domain *sd, enum cpu_idle_type idle, 2296 struct sched_domain *sd, enum cpu_idle_type idle,
1969 int *all_pinned, int *this_best_prio) 2297 int *all_pinned)
1970{ 2298{
1971 return balance_tasks(this_rq, this_cpu, busiest, 2299 return balance_tasks(this_rq, this_cpu, busiest,
1972 max_load_move, sd, idle, all_pinned, 2300 max_load_move, sd, idle, all_pinned,
1973 this_best_prio, &busiest->cfs); 2301 &busiest->cfs);
1974} 2302}
1975#endif 2303#endif
1976 2304
@@ -1987,12 +2315,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1987 int *all_pinned) 2315 int *all_pinned)
1988{ 2316{
1989 unsigned long total_load_moved = 0, load_moved; 2317 unsigned long total_load_moved = 0, load_moved;
1990 int this_best_prio = this_rq->curr->prio;
1991 2318
1992 do { 2319 do {
1993 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 2320 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
1994 max_load_move - total_load_moved, 2321 max_load_move - total_load_moved,
1995 sd, idle, all_pinned, &this_best_prio); 2322 sd, idle, all_pinned);
1996 2323
1997 total_load_moved += load_moved; 2324 total_load_moved += load_moved;
1998 2325
@@ -2030,12 +2357,17 @@ struct sd_lb_stats {
2030 unsigned long this_load; 2357 unsigned long this_load;
2031 unsigned long this_load_per_task; 2358 unsigned long this_load_per_task;
2032 unsigned long this_nr_running; 2359 unsigned long this_nr_running;
2360 unsigned long this_has_capacity;
2361 unsigned int this_idle_cpus;
2033 2362
2034 /* Statistics of the busiest group */ 2363 /* Statistics of the busiest group */
2364 unsigned int busiest_idle_cpus;
2035 unsigned long max_load; 2365 unsigned long max_load;
2036 unsigned long busiest_load_per_task; 2366 unsigned long busiest_load_per_task;
2037 unsigned long busiest_nr_running; 2367 unsigned long busiest_nr_running;
2038 unsigned long busiest_group_capacity; 2368 unsigned long busiest_group_capacity;
2369 unsigned long busiest_has_capacity;
2370 unsigned int busiest_group_weight;
2039 2371
2040 int group_imb; /* Is there imbalance in this sd */ 2372 int group_imb; /* Is there imbalance in this sd */
2041#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2373#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2057,7 +2389,10 @@ struct sg_lb_stats {
2057 unsigned long sum_nr_running; /* Nr tasks running in the group */ 2389 unsigned long sum_nr_running; /* Nr tasks running in the group */
2058 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2390 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2059 unsigned long group_capacity; 2391 unsigned long group_capacity;
2392 unsigned long idle_cpus;
2393 unsigned long group_weight;
2060 int group_imb; /* Is there an imbalance in the group ? */ 2394 int group_imb; /* Is there an imbalance in the group ? */
2395 int group_has_capacity; /* Is there extra capacity in the group? */
2061}; 2396};
2062 2397
2063/** 2398/**
@@ -2239,7 +2574,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2239 2574
2240unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 2575unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2241{ 2576{
2242 return SCHED_LOAD_SCALE; 2577 return SCHED_POWER_SCALE;
2243} 2578}
2244 2579
2245unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) 2580unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
@@ -2268,12 +2603,18 @@ unsigned long scale_rt_power(int cpu)
2268 u64 total, available; 2603 u64 total, available;
2269 2604
2270 total = sched_avg_period() + (rq->clock - rq->age_stamp); 2605 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2271 available = total - rq->rt_avg;
2272 2606
2273 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 2607 if (unlikely(total < rq->rt_avg)) {
2274 total = SCHED_LOAD_SCALE; 2608 /* Ensures that power won't end up being negative */
2609 available = 0;
2610 } else {
2611 available = total - rq->rt_avg;
2612 }
2613
2614 if (unlikely((s64)total < SCHED_POWER_SCALE))
2615 total = SCHED_POWER_SCALE;
2275 2616
2276 total >>= SCHED_LOAD_SHIFT; 2617 total >>= SCHED_POWER_SHIFT;
2277 2618
2278 return div_u64(available, total); 2619 return div_u64(available, total);
2279} 2620}
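
The clamp added to scale_rt_power() above avoids an unsigned underflow when rq->rt_avg exceeds the measured period. A tiny standalone illustration of why the guard is needed (made-up numbers):

    /* With unsigned 64-bit math, total - rt_avg would wrap to a huge
     * value whenever rt_avg > total; clamping to 0 keeps "available"
     * meaningful. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t total = 100, rt_avg = 150, available;

            if (total < rt_avg)
                    available = 0;              /* clamp instead of wrapping */
            else
                    available = total - rt_avg;

            printf("%llu\n", (unsigned long long)available);  /* 0 */
            return 0;
    }
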
@@ -2281,7 +2622,7 @@ unsigned long scale_rt_power(int cpu)
2281static void update_cpu_power(struct sched_domain *sd, int cpu) 2622static void update_cpu_power(struct sched_domain *sd, int cpu)
2282{ 2623{
2283 unsigned long weight = sd->span_weight; 2624 unsigned long weight = sd->span_weight;
2284 unsigned long power = SCHED_LOAD_SCALE; 2625 unsigned long power = SCHED_POWER_SCALE;
2285 struct sched_group *sdg = sd->groups; 2626 struct sched_group *sdg = sd->groups;
2286 2627
2287 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 2628 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
@@ -2290,26 +2631,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2290 else 2631 else
2291 power *= default_scale_smt_power(sd, cpu); 2632 power *= default_scale_smt_power(sd, cpu);
2292 2633
2293 power >>= SCHED_LOAD_SHIFT; 2634 power >>= SCHED_POWER_SHIFT;
2294 } 2635 }
2295 2636
2296 sdg->cpu_power_orig = power; 2637 sdg->sgp->power_orig = power;
2297 2638
2298 if (sched_feat(ARCH_POWER)) 2639 if (sched_feat(ARCH_POWER))
2299 power *= arch_scale_freq_power(sd, cpu); 2640 power *= arch_scale_freq_power(sd, cpu);
2300 else 2641 else
2301 power *= default_scale_freq_power(sd, cpu); 2642 power *= default_scale_freq_power(sd, cpu);
2302 2643
2303 power >>= SCHED_LOAD_SHIFT; 2644 power >>= SCHED_POWER_SHIFT;
2304 2645
2305 power *= scale_rt_power(cpu); 2646 power *= scale_rt_power(cpu);
2306 power >>= SCHED_LOAD_SHIFT; 2647 power >>= SCHED_POWER_SHIFT;
2307 2648
2308 if (!power) 2649 if (!power)
2309 power = 1; 2650 power = 1;
2310 2651
2311 cpu_rq(cpu)->cpu_power = power; 2652 cpu_rq(cpu)->cpu_power = power;
2312 sdg->cpu_power = power; 2653 sdg->sgp->power = power;
2313} 2654}
2314 2655
2315static void update_group_power(struct sched_domain *sd, int cpu) 2656static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2327,11 +2668,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
2327 2668
2328 group = child->groups; 2669 group = child->groups;
2329 do { 2670 do {
2330 power += group->cpu_power; 2671 power += group->sgp->power;
2331 group = group->next; 2672 group = group->next;
2332 } while (group != child->groups); 2673 } while (group != child->groups);
2333 2674
2334 sdg->cpu_power = power; 2675 sdg->sgp->power = power;
2335} 2676}
2336 2677
2337/* 2678/*
@@ -2345,15 +2686,15 @@ static inline int
2345fix_small_capacity(struct sched_domain *sd, struct sched_group *group) 2686fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2346{ 2687{
2347 /* 2688 /*
2348 * Only siblings can have significantly less than SCHED_LOAD_SCALE 2689 * Only siblings can have significantly less than SCHED_POWER_SCALE
2349 */ 2690 */
2350 if (sd->level != SD_LV_SIBLING) 2691 if (!(sd->flags & SD_SHARE_CPUPOWER))
2351 return 0; 2692 return 0;
2352 2693
2353 /* 2694 /*
2354 * If ~90% of the cpu_power is still there, we're good. 2695 * If ~90% of the cpu_power is still there, we're good.
2355 */ 2696 */
2356 if (group->cpu_power * 32 > group->cpu_power_orig * 29) 2697 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
2357 return 1; 2698 return 1;
2358 2699
2359 return 0; 2700 return 0;
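
The fix_small_capacity() test above keeps the "~90%" comparison in integer math: power * 32 > power_orig * 29 is equivalent to power / power_orig > 29/32, roughly 0.906. A small illustration with made-up values:

    /* Illustration of the integer threshold used above. */
    #include <stdio.h>

    static int capacity_ok(unsigned long power, unsigned long power_orig)
    {
            return power * 32 > power_orig * 29;
    }

    int main(void)
    {
            printf("%d\n", capacity_ok(930, 1024));  /* 29760 > 29696 -> 1 */
            printf("%d\n", capacity_ok(900, 1024));  /* 28800 > 29696 -> 0 */
            return 0;
    }
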
@@ -2366,7 +2707,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2366 * @this_cpu: Cpu for which load balance is currently performed. 2707 * @this_cpu: Cpu for which load balance is currently performed.
2367 * @idle: Idle status of this_cpu 2708 * @idle: Idle status of this_cpu
2368 * @load_idx: Load index of sched_domain of this_cpu for load calc. 2709 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2369 * @sd_idle: Idle status of the sched_domain containing group.
2370 * @local_group: Does group contain this_cpu. 2710 * @local_group: Does group contain this_cpu.
2371 * @cpus: Set of cpus considered for load balancing. 2711 * @cpus: Set of cpus considered for load balancing.
2372 * @balance: Should we balance. 2712 * @balance: Should we balance.
@@ -2374,11 +2714,11 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2374 */ 2714 */
2375static inline void update_sg_lb_stats(struct sched_domain *sd, 2715static inline void update_sg_lb_stats(struct sched_domain *sd,
2376 struct sched_group *group, int this_cpu, 2716 struct sched_group *group, int this_cpu,
2377 enum cpu_idle_type idle, int load_idx, int *sd_idle, 2717 enum cpu_idle_type idle, int load_idx,
2378 int local_group, const struct cpumask *cpus, 2718 int local_group, const struct cpumask *cpus,
2379 int *balance, struct sg_lb_stats *sgs) 2719 int *balance, struct sg_lb_stats *sgs)
2380{ 2720{
2381 unsigned long load, max_cpu_load, min_cpu_load; 2721 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
2382 int i; 2722 int i;
2383 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2723 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2384 unsigned long avg_load_per_task = 0; 2724 unsigned long avg_load_per_task = 0;
@@ -2389,13 +2729,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2389 /* Tally up the load of all CPUs in the group */ 2729 /* Tally up the load of all CPUs in the group */
2390 max_cpu_load = 0; 2730 max_cpu_load = 0;
2391 min_cpu_load = ~0UL; 2731 min_cpu_load = ~0UL;
2732 max_nr_running = 0;
2392 2733
2393 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2734 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2394 struct rq *rq = cpu_rq(i); 2735 struct rq *rq = cpu_rq(i);
2395 2736
2396 if (*sd_idle && rq->nr_running)
2397 *sd_idle = 0;
2398
2399 /* Bias balancing toward cpus of our domain */ 2737 /* Bias balancing toward cpus of our domain */
2400 if (local_group) { 2738 if (local_group) {
2401 if (idle_cpu(i) && !first_idle_cpu) { 2739 if (idle_cpu(i) && !first_idle_cpu) {
@@ -2406,8 +2744,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2406 load = target_load(i, load_idx); 2744 load = target_load(i, load_idx);
2407 } else { 2745 } else {
2408 load = source_load(i, load_idx); 2746 load = source_load(i, load_idx);
2409 if (load > max_cpu_load) 2747 if (load > max_cpu_load) {
2410 max_cpu_load = load; 2748 max_cpu_load = load;
2749 max_nr_running = rq->nr_running;
2750 }
2411 if (min_cpu_load > load) 2751 if (min_cpu_load > load)
2412 min_cpu_load = load; 2752 min_cpu_load = load;
2413 } 2753 }
@@ -2415,7 +2755,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2415 sgs->group_load += load; 2755 sgs->group_load += load;
2416 sgs->sum_nr_running += rq->nr_running; 2756 sgs->sum_nr_running += rq->nr_running;
2417 sgs->sum_weighted_load += weighted_cpuload(i); 2757 sgs->sum_weighted_load += weighted_cpuload(i);
2418 2758 if (idle_cpu(i))
2759 sgs->idle_cpus++;
2419 } 2760 }
2420 2761
2421 /* 2762 /*
@@ -2433,11 +2774,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2433 } 2774 }
2434 2775
2435 /* Adjust by relative CPU power of the group */ 2776 /* Adjust by relative CPU power of the group */
2436 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 2777 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
2437 2778
2438 /* 2779 /*
2439 * Consider the group unbalanced when the imbalance is larger 2780 * Consider the group unbalanced when the imbalance is larger
2440 * than the average weight of two tasks. 2781 * than the average weight of a task.
2441 * 2782 *
2442 * APZ: with cgroup the avg task weight can vary wildly and 2783 * APZ: with cgroup the avg task weight can vary wildly and
2443 * might not be a suitable number - should we keep a 2784 * might not be a suitable number - should we keep a
@@ -2447,13 +2788,17 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2447 if (sgs->sum_nr_running) 2788 if (sgs->sum_nr_running)
2448 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2789 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2449 2790
2450 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 2791 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2451 sgs->group_imb = 1; 2792 sgs->group_imb = 1;
2452 2793
2453 sgs->group_capacity = 2794 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
2454 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2795 SCHED_POWER_SCALE);
2455 if (!sgs->group_capacity) 2796 if (!sgs->group_capacity)
2456 sgs->group_capacity = fix_small_capacity(sd, group); 2797 sgs->group_capacity = fix_small_capacity(sd, group);
2798 sgs->group_weight = group->group_weight;
2799
2800 if (sgs->group_capacity > sgs->sum_nr_running)
2801 sgs->group_has_capacity = 1;
2457} 2802}
2458 2803
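The reworked update_sg_lb_stats() above now records, per group, the nr_running of the most loaded CPU (so a group is only flagged imbalanced when that CPU can actually give a task away), the number of idle CPUs, and whether the group still has spare capacity. Below is a minimal user-space sketch of that bookkeeping, assuming SCHED_POWER_SCALE is 1024 (its usual value) and using a local stand-in for the kernel's DIV_ROUND_CLOSEST(); the struct and function names are illustrative, not the kernel's.

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL /* assumed value of one unit of CPU power */
/* local stand-in for the kernel's DIV_ROUND_CLOSEST() for unsigned values */
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

struct sg_stats {                /* illustrative subset of sg_lb_stats */
    unsigned long group_load;
    unsigned long sum_nr_running;
    unsigned long group_capacity;
    int group_imb;
    int group_has_capacity;
};

static void fill_group_stats(struct sg_stats *sgs, unsigned long group_power,
                             const unsigned long *cpu_load,
                             const unsigned int *cpu_nr_running, int ncpus)
{
    unsigned long max_load = 0, min_load = ~0UL, max_nr_running = 0;
    unsigned long avg_load_per_task = 0;
    int i;

    for (i = 0; i < ncpus; i++) {
        sgs->group_load += cpu_load[i];
        sgs->sum_nr_running += cpu_nr_running[i];
        if (cpu_load[i] > max_load) {
            max_load = cpu_load[i];
            max_nr_running = cpu_nr_running[i];
        }
        if (cpu_load[i] < min_load)
            min_load = cpu_load[i];
    }

    if (sgs->sum_nr_running)
        avg_load_per_task = sgs->group_load / sgs->sum_nr_running;

    /* only call it imbalanced if the heaviest CPU can actually shed a task */
    if ((max_load - min_load) >= avg_load_per_task && max_nr_running > 1)
        sgs->group_imb = 1;

    sgs->group_capacity = DIV_ROUND_CLOSEST(group_power, SCHED_POWER_SCALE);
    if (sgs->group_capacity > sgs->sum_nr_running)
        sgs->group_has_capacity = 1;
}

int main(void)
{
    unsigned long load[2] = { 3072, 1024 };   /* weighted load per CPU */
    unsigned int nr[2] = { 3, 1 };
    struct sg_stats sgs = { 0 };

    fill_group_stats(&sgs, 2 * SCHED_POWER_SCALE, load, nr, 2);
    printf("capacity=%lu imb=%d has_capacity=%d\n",
           sgs.group_capacity, sgs.group_imb, sgs.group_has_capacity);
    return 0;
}

With the sample inputs (loads 3072 and 1024, three tasks versus one) the group is flagged imbalanced and reports a capacity of two tasks with no spare capacity, which is the shape of data the busiest-group selection below works from.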
2459/** 2804/**
@@ -2504,15 +2849,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
2504 * @sd: sched_domain whose statistics are to be updated. 2849 * @sd: sched_domain whose statistics are to be updated.
2505 * @this_cpu: Cpu for which load balance is currently performed. 2850 * @this_cpu: Cpu for which load balance is currently performed.
2506 * @idle: Idle status of this_cpu 2851 * @idle: Idle status of this_cpu
2507 * @sd_idle: Idle status of the sched_domain containing sg.
2508 * @cpus: Set of cpus considered for load balancing. 2852 * @cpus: Set of cpus considered for load balancing.
2509 * @balance: Should we balance. 2853 * @balance: Should we balance.
2510 * @sds: variable to hold the statistics for this sched_domain. 2854 * @sds: variable to hold the statistics for this sched_domain.
2511 */ 2855 */
2512static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 2856static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2513 enum cpu_idle_type idle, int *sd_idle, 2857 enum cpu_idle_type idle, const struct cpumask *cpus,
2514 const struct cpumask *cpus, int *balance, 2858 int *balance, struct sd_lb_stats *sds)
2515 struct sd_lb_stats *sds)
2516{ 2859{
2517 struct sched_domain *child = sd->child; 2860 struct sched_domain *child = sd->child;
2518 struct sched_group *sg = sd->groups; 2861 struct sched_group *sg = sd->groups;
@@ -2530,21 +2873,26 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2530 2873
2531 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 2874 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2532 memset(&sgs, 0, sizeof(sgs)); 2875 memset(&sgs, 0, sizeof(sgs));
2533 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, 2876 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
2534 local_group, cpus, balance, &sgs); 2877 local_group, cpus, balance, &sgs);
2535 2878
2536 if (local_group && !(*balance)) 2879 if (local_group && !(*balance))
2537 return; 2880 return;
2538 2881
2539 sds->total_load += sgs.group_load; 2882 sds->total_load += sgs.group_load;
2540 sds->total_pwr += sg->cpu_power; 2883 sds->total_pwr += sg->sgp->power;
2541 2884
2542 /* 2885 /*
2543 * In case the child domain prefers tasks go to siblings 2886 * In case the child domain prefers tasks go to siblings
2544 * first, lower the sg capacity to one so that we'll try 2887 * first, lower the sg capacity to one so that we'll try
2545 * and move all the excess tasks away. 2888 * and move all the excess tasks away. We lower the capacity
2889 * of a group only if the local group has the capacity to fit
2890 * these excess tasks, i.e. nr_running < group_capacity. The
2891 * extra check prevents the case where you always pull from the
2892 * heaviest group when it is already under-utilized (possible
2893 * with a large weight task outweighs the tasks on the system).
2546 */ 2894 */
2547 if (prefer_sibling) 2895 if (prefer_sibling && !local_group && sds->this_has_capacity)
2548 sgs.group_capacity = min(sgs.group_capacity, 1UL); 2896 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2549 2897
2550 if (local_group) { 2898 if (local_group) {
@@ -2552,12 +2900,17 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2552 sds->this = sg; 2900 sds->this = sg;
2553 sds->this_nr_running = sgs.sum_nr_running; 2901 sds->this_nr_running = sgs.sum_nr_running;
2554 sds->this_load_per_task = sgs.sum_weighted_load; 2902 sds->this_load_per_task = sgs.sum_weighted_load;
2903 sds->this_has_capacity = sgs.group_has_capacity;
2904 sds->this_idle_cpus = sgs.idle_cpus;
2555 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2905 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2556 sds->max_load = sgs.avg_load; 2906 sds->max_load = sgs.avg_load;
2557 sds->busiest = sg; 2907 sds->busiest = sg;
2558 sds->busiest_nr_running = sgs.sum_nr_running; 2908 sds->busiest_nr_running = sgs.sum_nr_running;
2909 sds->busiest_idle_cpus = sgs.idle_cpus;
2559 sds->busiest_group_capacity = sgs.group_capacity; 2910 sds->busiest_group_capacity = sgs.group_capacity;
2560 sds->busiest_load_per_task = sgs.sum_weighted_load; 2911 sds->busiest_load_per_task = sgs.sum_weighted_load;
2912 sds->busiest_has_capacity = sgs.group_has_capacity;
2913 sds->busiest_group_weight = sgs.group_weight;
2561 sds->group_imb = sgs.group_imb; 2914 sds->group_imb = sgs.group_imb;
2562 } 2915 }
2563 2916
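The prefer-sibling clamp in the hunk above is now conditional: a remote group's capacity is forced down to one task only when the local group can actually absorb the overflow, which avoids endlessly pulling from a group that is under-utilized apart from a single heavy task. A small sketch of that guard, with hypothetical parameter names standing in for the sd_lb_stats/sg_lb_stats fields:

/* Clamp a sibling group's capacity only if we can absorb its overflow.
 * prefer_sibling:    the child domain sets SD_PREFER_SIBLING.
 * local_group:       the group being examined contains this_cpu.
 * this_has_capacity: the local group still has nr_running < capacity.
 */
unsigned long effective_capacity(unsigned long group_capacity,
                                 int prefer_sibling, int local_group,
                                 int this_has_capacity)
{
    if (prefer_sibling && !local_group && this_has_capacity)
        return group_capacity < 1UL ? group_capacity : 1UL;
    return group_capacity;
}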
@@ -2612,8 +2965,8 @@ static int check_asym_packing(struct sched_domain *sd,
2612 if (this_cpu > busiest_cpu) 2965 if (this_cpu > busiest_cpu)
2613 return 0; 2966 return 0;
2614 2967
2615 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, 2968 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
2616 SCHED_LOAD_SCALE); 2969 SCHED_POWER_SCALE);
2617 return 1; 2970 return 1;
2618} 2971}
2619 2972
@@ -2642,8 +2995,8 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2642 cpu_avg_load_per_task(this_cpu); 2995 cpu_avg_load_per_task(this_cpu);
2643 2996
2644 scaled_busy_load_per_task = sds->busiest_load_per_task 2997 scaled_busy_load_per_task = sds->busiest_load_per_task
2645 * SCHED_LOAD_SCALE; 2998 * SCHED_POWER_SCALE;
2646 scaled_busy_load_per_task /= sds->busiest->cpu_power; 2999 scaled_busy_load_per_task /= sds->busiest->sgp->power;
2647 3000
2648 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 3001 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2649 (scaled_busy_load_per_task * imbn)) { 3002 (scaled_busy_load_per_task * imbn)) {
@@ -2657,30 +3010,30 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2657 * moving them. 3010 * moving them.
2658 */ 3011 */
2659 3012
2660 pwr_now += sds->busiest->cpu_power * 3013 pwr_now += sds->busiest->sgp->power *
2661 min(sds->busiest_load_per_task, sds->max_load); 3014 min(sds->busiest_load_per_task, sds->max_load);
2662 pwr_now += sds->this->cpu_power * 3015 pwr_now += sds->this->sgp->power *
2663 min(sds->this_load_per_task, sds->this_load); 3016 min(sds->this_load_per_task, sds->this_load);
2664 pwr_now /= SCHED_LOAD_SCALE; 3017 pwr_now /= SCHED_POWER_SCALE;
2665 3018
2666 /* Amount of load we'd subtract */ 3019 /* Amount of load we'd subtract */
2667 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3020 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
2668 sds->busiest->cpu_power; 3021 sds->busiest->sgp->power;
2669 if (sds->max_load > tmp) 3022 if (sds->max_load > tmp)
2670 pwr_move += sds->busiest->cpu_power * 3023 pwr_move += sds->busiest->sgp->power *
2671 min(sds->busiest_load_per_task, sds->max_load - tmp); 3024 min(sds->busiest_load_per_task, sds->max_load - tmp);
2672 3025
2673 /* Amount of load we'd add */ 3026 /* Amount of load we'd add */
2674 if (sds->max_load * sds->busiest->cpu_power < 3027 if (sds->max_load * sds->busiest->sgp->power <
2675 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3028 sds->busiest_load_per_task * SCHED_POWER_SCALE)
2676 tmp = (sds->max_load * sds->busiest->cpu_power) / 3029 tmp = (sds->max_load * sds->busiest->sgp->power) /
2677 sds->this->cpu_power; 3030 sds->this->sgp->power;
2678 else 3031 else
2679 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3032 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
2680 sds->this->cpu_power; 3033 sds->this->sgp->power;
2681 pwr_move += sds->this->cpu_power * 3034 pwr_move += sds->this->sgp->power *
2682 min(sds->this_load_per_task, sds->this_load + tmp); 3035 min(sds->this_load_per_task, sds->this_load + tmp);
2683 pwr_move /= SCHED_LOAD_SCALE; 3036 pwr_move /= SCHED_POWER_SCALE;
2684 3037
2685 /* Move if we gain throughput */ 3038 /* Move if we gain throughput */
2686 if (pwr_move > pwr_now) 3039 if (pwr_move > pwr_now)
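Apart from the cpu_power to sgp->power and SCHED_LOAD_SCALE to SCHED_POWER_SCALE renames, fix_small_imbalance() keeps its estimate: compare the throughput obtained now (pwr_now) with the throughput after moving one busiest-group task (pwr_move), and only force a move if it gains. A user-space sketch of that comparison, assuming SCHED_POWER_SCALE is 1024; the function and parameter names mirror the stats fields but are otherwise illustrative:

#define SCHED_POWER_SCALE 1024UL /* assumed */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

/* Non-zero if moving one task from the busiest group to this group is
 * expected to increase total throughput (the fix_small_imbalance test). */
int move_gains_throughput(unsigned long busiest_power, unsigned long this_power,
                          unsigned long busiest_load_per_task,
                          unsigned long this_load_per_task,
                          unsigned long max_load, unsigned long this_load)
{
    unsigned long pwr_now = 0, pwr_move = 0, tmp;

    pwr_now += busiest_power * min_ul(busiest_load_per_task, max_load);
    pwr_now += this_power * min_ul(this_load_per_task, this_load);
    pwr_now /= SCHED_POWER_SCALE;

    /* load we would subtract from the busiest group */
    tmp = (busiest_load_per_task * SCHED_POWER_SCALE) / busiest_power;
    if (max_load > tmp)
        pwr_move += busiest_power *
                    min_ul(busiest_load_per_task, max_load - tmp);

    /* load we would add to this group */
    if (max_load * busiest_power <
        busiest_load_per_task * SCHED_POWER_SCALE)
        tmp = (max_load * busiest_power) / this_power;
    else
        tmp = (busiest_load_per_task * SCHED_POWER_SCALE) / this_power;
    pwr_move += this_power * min_ul(this_load_per_task, this_load + tmp);
    pwr_move /= SCHED_POWER_SCALE;

    return pwr_move > pwr_now;
}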
@@ -2722,9 +3075,9 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2722 load_above_capacity = (sds->busiest_nr_running - 3075 load_above_capacity = (sds->busiest_nr_running -
2723 sds->busiest_group_capacity); 3076 sds->busiest_group_capacity);
2724 3077
2725 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); 3078 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
2726 3079
2727 load_above_capacity /= sds->busiest->cpu_power; 3080 load_above_capacity /= sds->busiest->sgp->power;
2728 } 3081 }
2729 3082
2730 /* 3083 /*
@@ -2740,13 +3093,13 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2740 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 3093 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2741 3094
2742 /* How much load to actually move to equalise the imbalance */ 3095 /* How much load to actually move to equalise the imbalance */
2743 *imbalance = min(max_pull * sds->busiest->cpu_power, 3096 *imbalance = min(max_pull * sds->busiest->sgp->power,
2744 (sds->avg_load - sds->this_load) * sds->this->cpu_power) 3097 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
2745 / SCHED_LOAD_SCALE; 3098 / SCHED_POWER_SCALE;
2746 3099
2747 /* 3100 /*
2748 * if *imbalance is less than the average load per runnable task 3101 * if *imbalance is less than the average load per runnable task
2749 * there is no gaurantee that any tasks will be moved so we'll have 3102 * there is no guarantee that any tasks will be moved so we'll have
2750 * a think about bumping its value to force at least one task to be 3103 * a think about bumping its value to force at least one task to be
2751 * moved 3104 * moved
2752 */ 3105 */
@@ -2754,6 +3107,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2754 return fix_small_imbalance(sds, this_cpu, imbalance); 3107 return fix_small_imbalance(sds, this_cpu, imbalance);
2755 3108
2756} 3109}
3110
2757/******* find_busiest_group() helpers end here *********************/ 3111/******* find_busiest_group() helpers end here *********************/
2758 3112
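The imbalance itself is still the smaller of two caps, now expressed against sgp->power and SCHED_POWER_SCALE: how far the busiest group sits above the domain average (bounded by its load above capacity) and how much headroom the local group has below that average. A sketch of that final computation, with illustrative names and SCHED_POWER_SCALE again assumed to be 1024:

#define SCHED_POWER_SCALE 1024UL /* assumed */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

/* Weighted load to pull, as at the end of calculate_imbalance(). */
unsigned long weighted_imbalance(unsigned long max_load, unsigned long avg_load,
                                 unsigned long this_load,
                                 unsigned long load_above_capacity,
                                 unsigned long busiest_power,
                                 unsigned long this_power)
{
    /* don't pull the busiest group below the domain average ... */
    unsigned long max_pull = min_ul(max_load - avg_load, load_above_capacity);

    /* ... and don't push the local group above it either */
    return min_ul(max_pull * busiest_power,
                  (avg_load - this_load) * this_power) / SCHED_POWER_SCALE;
}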
2759/** 3113/**
@@ -2771,7 +3125,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2771 * @imbalance: Variable which stores amount of weighted load which should 3125 * @imbalance: Variable which stores amount of weighted load which should
2772 * be moved to restore balance/put a group to idle. 3126 * be moved to restore balance/put a group to idle.
2773 * @idle: The idle status of this_cpu. 3127 * @idle: The idle status of this_cpu.
2774 * @sd_idle: The idleness of sd
2775 * @cpus: The set of CPUs under consideration for load-balancing. 3128 * @cpus: The set of CPUs under consideration for load-balancing.
2776 * @balance: Pointer to a variable indicating if this_cpu 3129 * @balance: Pointer to a variable indicating if this_cpu
2777 * is the appropriate cpu to perform load balancing at this_level. 3130 * is the appropriate cpu to perform load balancing at this_level.
@@ -2784,7 +3137,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2784static struct sched_group * 3137static struct sched_group *
2785find_busiest_group(struct sched_domain *sd, int this_cpu, 3138find_busiest_group(struct sched_domain *sd, int this_cpu,
2786 unsigned long *imbalance, enum cpu_idle_type idle, 3139 unsigned long *imbalance, enum cpu_idle_type idle,
2787 int *sd_idle, const struct cpumask *cpus, int *balance) 3140 const struct cpumask *cpus, int *balance)
2788{ 3141{
2789 struct sd_lb_stats sds; 3142 struct sd_lb_stats sds;
2790 3143
@@ -2794,17 +3147,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 2794 * Compute the various statistics relevant for load balancing at 3147 * Compute the various statistics relevant for load balancing at
2795 * this level. 3148 * this level.
2796 */ 3149 */
2797 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, 3150 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
2798 balance, &sds);
2799 3151
2800 /* Cases where imbalance does not exist from POV of this_cpu */ 3152 /*
2801 /* 1) this_cpu is not the appropriate cpu to perform load balancing 3153 * this_cpu is not the appropriate cpu to perform load balancing at
2802 * at this level. 3154 * this level.
2803 * 2) There is no busy sibling group to pull from.
2804 * 3) This group is the busiest group.
2805 * 4) This group is more busy than the avg busieness at this
2806 * sched_domain.
2807 * 5) The imbalance is within the specified limit.
2808 */ 3155 */
2809 if (!(*balance)) 3156 if (!(*balance))
2810 goto ret; 3157 goto ret;
@@ -2813,20 +3160,59 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2813 check_asym_packing(sd, &sds, this_cpu, imbalance)) 3160 check_asym_packing(sd, &sds, this_cpu, imbalance))
2814 return sds.busiest; 3161 return sds.busiest;
2815 3162
3163 /* There is no busy sibling group to pull tasks from */
2816 if (!sds.busiest || sds.busiest_nr_running == 0) 3164 if (!sds.busiest || sds.busiest_nr_running == 0)
2817 goto out_balanced; 3165 goto out_balanced;
2818 3166
3167 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
3168
3169 /*
3170 * If the busiest group is imbalanced the below checks don't
 3171 * work because they assume all things are equal, which typically
3172 * isn't true due to cpus_allowed constraints and the like.
3173 */
3174 if (sds.group_imb)
3175 goto force_balance;
3176
3177 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
3178 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
3179 !sds.busiest_has_capacity)
3180 goto force_balance;
3181
3182 /*
3183 * If the local group is more busy than the selected busiest group
3184 * don't try and pull any tasks.
3185 */
2819 if (sds.this_load >= sds.max_load) 3186 if (sds.this_load >= sds.max_load)
2820 goto out_balanced; 3187 goto out_balanced;
2821 3188
2822 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 3189 /*
2823 3190 * Don't pull any tasks if this group is already above the domain
3191 * average load.
3192 */
2824 if (sds.this_load >= sds.avg_load) 3193 if (sds.this_load >= sds.avg_load)
2825 goto out_balanced; 3194 goto out_balanced;
2826 3195
2827 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 3196 if (idle == CPU_IDLE) {
2828 goto out_balanced; 3197 /*
3198 * This cpu is idle. If the busiest group load doesn't
3199 * have more tasks than the number of available cpu's and
3200 * there is no imbalance between this and busiest group
3201 * wrt to idle cpu's, it is balanced.
3202 */
3203 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3204 sds.busiest_nr_running <= sds.busiest_group_weight)
3205 goto out_balanced;
3206 } else {
3207 /*
3208 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
3209 * imbalance_pct to be conservative.
3210 */
3211 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3212 goto out_balanced;
3213 }
2829 3214
3215force_balance:
2830 /* Looks like there is an imbalance. Compute it */ 3216 /* Looks like there is an imbalance. Compute it */
2831 calculate_imbalance(&sds, this_cpu, imbalance); 3217 calculate_imbalance(&sds, this_cpu, imbalance);
2832 return sds.busiest; 3218 return sds.busiest;
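With sd_idle gone, find_busiest_group() now decides between forcing a balance and bailing out purely from the group statistics: a flagged group_imb, or a newly-idle CPU that has capacity while the busiest group has none, forces a balance; otherwise the local group must be lighter than both the busiest group and the domain average, and a fully idle CPU additionally requires the busiest group to be over-committed relative to its idle-CPU count. A compressed sketch of that ordering (the asym-packing early return is left out, and the enum values and field names are stand-ins, not the kernel's):

enum idle_type { NOT_IDLE, IDLE, NEWLY_IDLE };

struct dom_stats {               /* illustrative subset of sd_lb_stats */
    unsigned long max_load, this_load, avg_load;
    unsigned long busiest_nr_running, busiest_group_weight;
    unsigned long this_idle_cpus, busiest_idle_cpus;
    int group_imb, this_has_capacity, busiest_has_capacity;
};

/* 1 = compute an imbalance and balance, 0 = treat the domain as balanced */
int should_balance(const struct dom_stats *s, enum idle_type idle,
                   unsigned int imbalance_pct)
{
    if (!s->busiest_nr_running)
        return 0;                       /* no busy sibling group to pull from */
    if (s->group_imb)
        return 1;                       /* cpus_allowed skew: force it */
    if (idle == NEWLY_IDLE && s->this_has_capacity &&
        !s->busiest_has_capacity)
        return 1;                       /* newidle balance trumps SMP nice */
    if (s->this_load >= s->max_load)
        return 0;                       /* local group is the heavier side */
    if (s->this_load >= s->avg_load)
        return 0;                       /* already above the domain average */
    if (idle == IDLE)                   /* idle CPU: compare idle-cpu counts */
        return !(s->this_idle_cpus <= s->busiest_idle_cpus + 1 &&
                 s->busiest_nr_running <= s->busiest_group_weight);
    return 100 * s->max_load > imbalance_pct * s->this_load;
}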
@@ -2857,7 +3243,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2857 3243
2858 for_each_cpu(i, sched_group_cpus(group)) { 3244 for_each_cpu(i, sched_group_cpus(group)) {
2859 unsigned long power = power_of(i); 3245 unsigned long power = power_of(i);
2860 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 3246 unsigned long capacity = DIV_ROUND_CLOSEST(power,
3247 SCHED_POWER_SCALE);
2861 unsigned long wl; 3248 unsigned long wl;
2862 3249
2863 if (!capacity) 3250 if (!capacity)
@@ -2882,7 +3269,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2882 * the load can be moved away from the cpu that is potentially 3269 * the load can be moved away from the cpu that is potentially
2883 * running at a lower capacity. 3270 * running at a lower capacity.
2884 */ 3271 */
2885 wl = (wl * SCHED_LOAD_SCALE) / power; 3272 wl = (wl * SCHED_POWER_SCALE) / power;
2886 3273
2887 if (wl > max_load) { 3274 if (wl > max_load) {
2888 max_load = wl; 3275 max_load = wl;
@@ -2902,7 +3289,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2902/* Working cpumask for load_balance and load_balance_newidle. */ 3289/* Working cpumask for load_balance and load_balance_newidle. */
2903static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 3290static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2904 3291
2905static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, 3292static int need_active_balance(struct sched_domain *sd, int idle,
2906 int busiest_cpu, int this_cpu) 3293 int busiest_cpu, int this_cpu)
2907{ 3294{
2908 if (idle == CPU_NEWLY_IDLE) { 3295 if (idle == CPU_NEWLY_IDLE) {
@@ -2934,10 +3321,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
2934 * move_tasks() will succeed. ld_moved will be true and this 3321 * move_tasks() will succeed. ld_moved will be true and this
2935 * active balance code will not be triggered. 3322 * active balance code will not be triggered.
2936 */ 3323 */
2937 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2938 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2939 return 0;
2940
2941 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) 3324 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2942 return 0; 3325 return 0;
2943 } 3326 }
@@ -2955,7 +3338,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2955 struct sched_domain *sd, enum cpu_idle_type idle, 3338 struct sched_domain *sd, enum cpu_idle_type idle,
2956 int *balance) 3339 int *balance)
2957{ 3340{
2958 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3341 int ld_moved, all_pinned = 0, active_balance = 0;
2959 struct sched_group *group; 3342 struct sched_group *group;
2960 unsigned long imbalance; 3343 unsigned long imbalance;
2961 struct rq *busiest; 3344 struct rq *busiest;
@@ -2964,21 +3347,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2964 3347
2965 cpumask_copy(cpus, cpu_active_mask); 3348 cpumask_copy(cpus, cpu_active_mask);
2966 3349
2967 /*
2968 * When power savings policy is enabled for the parent domain, idle
2969 * sibling can pick up load irrespective of busy siblings. In this case,
2970 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2971 * portraying it as CPU_NOT_IDLE.
2972 */
2973 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2974 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2975 sd_idle = 1;
2976
2977 schedstat_inc(sd, lb_count[idle]); 3350 schedstat_inc(sd, lb_count[idle]);
2978 3351
2979redo: 3352redo:
2980 update_shares(sd); 3353 group = find_busiest_group(sd, this_cpu, &imbalance, idle,
2981 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2982 cpus, balance); 3354 cpus, balance);
2983 3355
2984 if (*balance == 0) 3356 if (*balance == 0)
@@ -3007,6 +3379,7 @@ redo:
3007 * still unbalanced. ld_moved simply stays zero, so it is 3379 * still unbalanced. ld_moved simply stays zero, so it is
3008 * correctly treated as an imbalance. 3380 * correctly treated as an imbalance.
3009 */ 3381 */
3382 all_pinned = 1;
3010 local_irq_save(flags); 3383 local_irq_save(flags);
3011 double_rq_lock(this_rq, busiest); 3384 double_rq_lock(this_rq, busiest);
3012 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3385 ld_moved = move_tasks(this_rq, this_cpu, busiest,
@@ -3031,10 +3404,16 @@ redo:
3031 3404
3032 if (!ld_moved) { 3405 if (!ld_moved) {
3033 schedstat_inc(sd, lb_failed[idle]); 3406 schedstat_inc(sd, lb_failed[idle]);
3034 sd->nr_balance_failed++; 3407 /*
3408 * Increment the failure counter only on periodic balance.
3409 * We do not want newidle balance, which can be very
3410 * frequent, pollute the failure counter causing
3411 * excessive cache_hot migrations and active balances.
3412 */
3413 if (idle != CPU_NEWLY_IDLE)
3414 sd->nr_balance_failed++;
3035 3415
3036 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), 3416 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
3037 this_cpu)) {
3038 raw_spin_lock_irqsave(&busiest->lock, flags); 3417 raw_spin_lock_irqsave(&busiest->lock, flags);
3039 3418
3040 /* don't kick the active_load_balance_cpu_stop, 3419 /* don't kick the active_load_balance_cpu_stop,
@@ -3089,10 +3468,6 @@ redo:
3089 sd->balance_interval *= 2; 3468 sd->balance_interval *= 2;
3090 } 3469 }
3091 3470
3092 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3093 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3094 ld_moved = -1;
3095
3096 goto out; 3471 goto out;
3097 3472
3098out_balanced: 3473out_balanced:
@@ -3106,14 +3481,8 @@ out_one_pinned:
3106 (sd->balance_interval < sd->max_interval)) 3481 (sd->balance_interval < sd->max_interval))
3107 sd->balance_interval *= 2; 3482 sd->balance_interval *= 2;
3108 3483
3109 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3484 ld_moved = 0;
3110 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3111 ld_moved = -1;
3112 else
3113 ld_moved = 0;
3114out: 3485out:
3115 if (ld_moved)
3116 update_shares(sd);
3117 return ld_moved; 3486 return ld_moved;
3118} 3487}
3119 3488
@@ -3137,6 +3506,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3137 */ 3506 */
3138 raw_spin_unlock(&this_rq->lock); 3507 raw_spin_unlock(&this_rq->lock);
3139 3508
3509 update_shares(this_cpu);
3510 rcu_read_lock();
3140 for_each_domain(this_cpu, sd) { 3511 for_each_domain(this_cpu, sd) {
3141 unsigned long interval; 3512 unsigned long interval;
3142 int balance = 1; 3513 int balance = 1;
@@ -3158,6 +3529,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3158 break; 3529 break;
3159 } 3530 }
3160 } 3531 }
3532 rcu_read_unlock();
3161 3533
3162 raw_spin_lock(&this_rq->lock); 3534 raw_spin_lock(&this_rq->lock);
3163 3535
@@ -3206,6 +3578,7 @@ static int active_load_balance_cpu_stop(void *data)
3206 double_lock_balance(busiest_rq, target_rq); 3578 double_lock_balance(busiest_rq, target_rq);
3207 3579
3208 /* Search for an sd spanning us and the target CPU. */ 3580 /* Search for an sd spanning us and the target CPU. */
3581 rcu_read_lock();
3209 for_each_domain(target_cpu, sd) { 3582 for_each_domain(target_cpu, sd) {
3210 if ((sd->flags & SD_LOAD_BALANCE) && 3583 if ((sd->flags & SD_LOAD_BALANCE) &&
3211 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) 3584 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3221,6 +3594,7 @@ static int active_load_balance_cpu_stop(void *data)
3221 else 3594 else
3222 schedstat_inc(sd, alb_failed); 3595 schedstat_inc(sd, alb_failed);
3223 } 3596 }
3597 rcu_read_unlock();
3224 double_unlock_balance(busiest_rq, target_rq); 3598 double_unlock_balance(busiest_rq, target_rq);
3225out_unlock: 3599out_unlock:
3226 busiest_rq->active_balance = 0; 3600 busiest_rq->active_balance = 0;
@@ -3347,6 +3721,7 @@ static int find_new_ilb(int cpu)
3347{ 3721{
3348 struct sched_domain *sd; 3722 struct sched_domain *sd;
3349 struct sched_group *ilb_group; 3723 struct sched_group *ilb_group;
3724 int ilb = nr_cpu_ids;
3350 3725
3351 /* 3726 /*
3352 * Have idle load balancer selection from semi-idle packages only 3727 * Have idle load balancer selection from semi-idle packages only
@@ -3362,20 +3737,25 @@ static int find_new_ilb(int cpu)
3362 if (cpumask_weight(nohz.idle_cpus_mask) < 2) 3737 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3363 goto out_done; 3738 goto out_done;
3364 3739
3740 rcu_read_lock();
3365 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3741 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3366 ilb_group = sd->groups; 3742 ilb_group = sd->groups;
3367 3743
3368 do { 3744 do {
3369 if (is_semi_idle_group(ilb_group)) 3745 if (is_semi_idle_group(ilb_group)) {
3370 return cpumask_first(nohz.grp_idle_mask); 3746 ilb = cpumask_first(nohz.grp_idle_mask);
3747 goto unlock;
3748 }
3371 3749
3372 ilb_group = ilb_group->next; 3750 ilb_group = ilb_group->next;
3373 3751
3374 } while (ilb_group != sd->groups); 3752 } while (ilb_group != sd->groups);
3375 } 3753 }
3754unlock:
3755 rcu_read_unlock();
3376 3756
3377out_done: 3757out_done:
3378 return nr_cpu_ids; 3758 return ilb;
3379} 3759}
3380#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3760#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3381static inline int find_new_ilb(int call_cpu) 3761static inline int find_new_ilb(int call_cpu)
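find_new_ilb() used to return straight out of the group walk; now that the domain iteration sits under rcu_read_lock(), the result is stashed in a local (defaulting to nr_cpu_ids, i.e. "no balancer found") and the loop exits through a single unlock label. The same shape in plain user-space C, with a pthread mutex standing in for the RCU read lock purely for illustration:

#include <pthread.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

struct node {
    struct node *next;
    int id;
    int semi_idle;               /* stand-in for is_semi_idle_group() */
};

#define NO_CPU (-1)              /* stand-in for nr_cpu_ids */

/* Search a lock-protected list, always leaving through one unlock site. */
int find_candidate(struct node *head)
{
    int found = NO_CPU;          /* default result, like ilb = nr_cpu_ids */
    struct node *n;

    pthread_mutex_lock(&list_lock);
    for (n = head; n; n = n->next) {
        if (n->semi_idle) {
            found = n->id;
            goto unlock;         /* never return with the lock held */
        }
    }
unlock:
    pthread_mutex_unlock(&list_lock);
    return found;
}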
@@ -3490,6 +3870,17 @@ void select_nohz_load_balancer(int stop_tick)
3490 3870
3491static DEFINE_SPINLOCK(balancing); 3871static DEFINE_SPINLOCK(balancing);
3492 3872
3873static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3874
3875/*
3876 * Scale the max load_balance interval with the number of CPUs in the system.
3877 * This trades load-balance latency on larger machines for less cross talk.
3878 */
3879static void update_max_interval(void)
3880{
3881 max_load_balance_interval = HZ*num_online_cpus()/10;
3882}
3883
3493/* 3884/*
3494 * It checks each scheduling domain to see if it is due to be balanced, 3885 * It checks each scheduling domain to see if it is due to be balanced,
3495 * and initiates a balancing operation if so. 3886 * and initiates a balancing operation if so.
@@ -3507,6 +3898,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3507 int update_next_balance = 0; 3898 int update_next_balance = 0;
3508 int need_serialize; 3899 int need_serialize;
3509 3900
3901 update_shares(cpu);
3902
3903 rcu_read_lock();
3510 for_each_domain(cpu, sd) { 3904 for_each_domain(cpu, sd) {
3511 if (!(sd->flags & SD_LOAD_BALANCE)) 3905 if (!(sd->flags & SD_LOAD_BALANCE))
3512 continue; 3906 continue;
@@ -3517,10 +3911,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3517 3911
3518 /* scale ms to jiffies */ 3912 /* scale ms to jiffies */
3519 interval = msecs_to_jiffies(interval); 3913 interval = msecs_to_jiffies(interval);
3520 if (unlikely(!interval)) 3914 interval = clamp(interval, 1UL, max_load_balance_interval);
3521 interval = 1;
3522 if (interval > HZ*NR_CPUS/10)
3523 interval = HZ*NR_CPUS/10;
3524 3915
3525 need_serialize = sd->flags & SD_SERIALIZE; 3916 need_serialize = sd->flags & SD_SERIALIZE;
3526 3917
@@ -3533,8 +3924,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3533 if (load_balance(cpu, rq, sd, idle, &balance)) { 3924 if (load_balance(cpu, rq, sd, idle, &balance)) {
3534 /* 3925 /*
3535 * We've pulled tasks over so either we're no 3926 * We've pulled tasks over so either we're no
3536 * longer idle, or one of our SMT siblings is 3927 * longer idle.
3537 * not idle.
3538 */ 3928 */
3539 idle = CPU_NOT_IDLE; 3929 idle = CPU_NOT_IDLE;
3540 } 3930 }
@@ -3556,6 +3946,7 @@ out:
3556 if (!balance) 3946 if (!balance)
3557 break; 3947 break;
3558 } 3948 }
3949 rcu_read_unlock();
3559 3950
3560 /* 3951 /*
3561 * next_balance will be updated only when there is a need. 3952 * next_balance will be updated only when there is a need.
@@ -3751,8 +4142,11 @@ static void task_fork_fair(struct task_struct *p)
3751 4142
3752 update_rq_clock(rq); 4143 update_rq_clock(rq);
3753 4144
3754 if (unlikely(task_cpu(p) != this_cpu)) 4145 if (unlikely(task_cpu(p) != this_cpu)) {
4146 rcu_read_lock();
3755 __set_task_cpu(p, this_cpu); 4147 __set_task_cpu(p, this_cpu);
4148 rcu_read_unlock();
4149 }
3756 4150
3757 update_curr(cfs_rq); 4151 update_curr(cfs_rq);
3758 4152
@@ -3778,33 +4172,62 @@ static void task_fork_fair(struct task_struct *p)
3778 * Priority of the task has changed. Check to see if we preempt 4172 * Priority of the task has changed. Check to see if we preempt
3779 * the current task. 4173 * the current task.
3780 */ 4174 */
3781static void prio_changed_fair(struct rq *rq, struct task_struct *p, 4175static void
3782 int oldprio, int running) 4176prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
3783{ 4177{
4178 if (!p->se.on_rq)
4179 return;
4180
3784 /* 4181 /*
3785 * Reschedule if we are currently running on this runqueue and 4182 * Reschedule if we are currently running on this runqueue and
3786 * our priority decreased, or if we are not currently running on 4183 * our priority decreased, or if we are not currently running on
3787 * this runqueue and our priority is higher than the current's 4184 * this runqueue and our priority is higher than the current's
3788 */ 4185 */
3789 if (running) { 4186 if (rq->curr == p) {
3790 if (p->prio > oldprio) 4187 if (p->prio > oldprio)
3791 resched_task(rq->curr); 4188 resched_task(rq->curr);
3792 } else 4189 } else
3793 check_preempt_curr(rq, p, 0); 4190 check_preempt_curr(rq, p, 0);
3794} 4191}
3795 4192
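prio_changed_fair() lost its running argument: it now bails out early if the task is not queued at all, then distinguishes "p is what this runqueue is running" (reschedule only if the priority got numerically larger, i.e. weaker) from "p is queued but not running" (check whether it should preempt the current task). A compact sketch of the decision, with stand-in types rather than the kernel's rq/task_struct:

struct toy_task { int prio; int on_rq; };
struct toy_rq   { struct toy_task *curr; };

enum prio_action { NOTHING, RESCHED_CURR, CHECK_PREEMPT };

/* What the new prio_changed_fair() decides (lower prio value = stronger). */
enum prio_action on_prio_change(struct toy_rq *rq, struct toy_task *p,
                                int oldprio)
{
    if (!p->on_rq)
        return NOTHING;             /* not queued: nothing to re-evaluate */
    if (rq->curr == p)
        return p->prio > oldprio ? RESCHED_CURR : NOTHING;
    return CHECK_PREEMPT;           /* maybe p should preempt rq->curr */
}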
4193static void switched_from_fair(struct rq *rq, struct task_struct *p)
4194{
4195 struct sched_entity *se = &p->se;
4196 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4197
4198 /*
4199 * Ensure the task's vruntime is normalized, so that when its
4200 * switched back to the fair class the enqueue_entity(.flags=0) will
4201 * do the right thing.
4202 *
4203 * If it was on_rq, then the dequeue_entity(.flags=0) will already
4204 * have normalized the vruntime, if it was !on_rq, then only when
4205 * the task is sleeping will it still have non-normalized vruntime.
4206 */
4207 if (!se->on_rq && p->state != TASK_RUNNING) {
4208 /*
4209 * Fix up our vruntime so that the current sleep doesn't
4210 * cause 'unlimited' sleep bonus.
4211 */
4212 place_entity(cfs_rq, se, 0);
4213 se->vruntime -= cfs_rq->min_vruntime;
4214 }
4215}
4216
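The new switched_from_fair() closes a corner case: a task that leaves the fair class while asleep still carries a vruntime that is absolute on its old cfs_rq, so it is re-placed and then expressed relative to min_vruntime, exactly like a normally dequeued task; otherwise, re-entering CFS much later would hand it an outsized sleeper bonus. A toy model of the relative/absolute bookkeeping, with place_entity() deliberately reduced to snapping up to min_vruntime and all names illustrative:

#include <stdio.h>
#include <stdint.h>

struct toy_cfs_rq { uint64_t min_vruntime; };
struct toy_se     { uint64_t vruntime; int on_rq; };

/* dequeue for sleep keeps vruntime absolute; other dequeues make it relative */
static void dequeue(struct toy_cfs_rq *rq, struct toy_se *se, int sleeping)
{
    se->on_rq = 0;
    if (!sleeping)
        se->vruntime -= rq->min_vruntime;
}

/* what switched_from_fair() adds: also normalize a sleeping, dequeued entity */
static void leave_fair_class(struct toy_cfs_rq *rq, struct toy_se *se,
                             int task_runnable)
{
    if (!se->on_rq && !task_runnable) {
        /* stand-in for place_entity(): don't keep an ancient vruntime */
        if (se->vruntime < rq->min_vruntime)
            se->vruntime = rq->min_vruntime;
        se->vruntime -= rq->min_vruntime;
    }
}

int main(void)
{
    struct toy_cfs_rq rq = { .min_vruntime = 1000000 };
    struct toy_se se = { .vruntime = 990000, .on_rq = 1 };

    dequeue(&rq, &se, 1);          /* task goes to sleep: vruntime stays absolute */
    leave_fair_class(&rq, &se, 0); /* e.g. setscheduler() to RT while asleep */
    printf("normalized vruntime: %llu\n", (unsigned long long)se.vruntime);
    return 0;
}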
3796/* 4217/*
3797 * We switched to the sched_fair class. 4218 * We switched to the sched_fair class.
3798 */ 4219 */
3799static void switched_to_fair(struct rq *rq, struct task_struct *p, 4220static void switched_to_fair(struct rq *rq, struct task_struct *p)
3800 int running)
3801{ 4221{
4222 if (!p->se.on_rq)
4223 return;
4224
3802 /* 4225 /*
3803 * We were most likely switched from sched_rt, so 4226 * We were most likely switched from sched_rt, so
3804 * kick off the schedule if running, otherwise just see 4227 * kick off the schedule if running, otherwise just see
3805 * if we can still preempt the current task. 4228 * if we can still preempt the current task.
3806 */ 4229 */
3807 if (running) 4230 if (rq->curr == p)
3808 resched_task(rq->curr); 4231 resched_task(rq->curr);
3809 else 4232 else
3810 check_preempt_curr(rq, p, 0); 4233 check_preempt_curr(rq, p, 0);
@@ -3824,13 +4247,26 @@ static void set_curr_task_fair(struct rq *rq)
3824} 4247}
3825 4248
3826#ifdef CONFIG_FAIR_GROUP_SCHED 4249#ifdef CONFIG_FAIR_GROUP_SCHED
3827static void moved_group_fair(struct task_struct *p, int on_rq) 4250static void task_move_group_fair(struct task_struct *p, int on_rq)
3828{ 4251{
3829 struct cfs_rq *cfs_rq = task_cfs_rq(p); 4252 /*
3830 4253 * If the task was not on the rq at the time of this cgroup movement
3831 update_curr(cfs_rq); 4254 * it must have been asleep, sleeping tasks keep their ->vruntime
4255 * absolute on their old rq until wakeup (needed for the fair sleeper
4256 * bonus in place_entity()).
4257 *
4258 * If it was on the rq, we've just 'preempted' it, which does convert
4259 * ->vruntime to a relative base.
4260 *
4261 * Make sure both cases convert their relative position when migrating
4262 * to another cgroup's rq. This does somewhat interfere with the
4263 * fair sleeper stuff for the first placement, but who cares.
4264 */
4265 if (!on_rq)
4266 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
4267 set_task_rq(p, task_cpu(p));
3832 if (!on_rq) 4268 if (!on_rq)
3833 place_entity(cfs_rq, &p->se, 1); 4269 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
3834} 4270}
3835#endif 4271#endif
3836 4272
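task_move_group_fair() applies the same relative/absolute reasoning to cgroup migration: a sleeping task's vruntime is still anchored to the old group's cfs_rq, so it is made relative before the task is retargeted and re-based onto the new group's min_vruntime afterwards; a queued task was already converted by the dequeue that preceded the move. A brief sketch of that re-basing, with toy types and the cfs_rq pointer swap standing in for set_task_rq():

#include <stdint.h>

struct toy_cfs_rq { uint64_t min_vruntime; };
struct toy_task   { uint64_t vruntime; struct toy_cfs_rq *cfs_rq; };

/* Move a task to another group's cfs_rq, keeping its relative position. */
void move_group(struct toy_task *p, struct toy_cfs_rq *new_rq, int on_rq)
{
    if (!on_rq)                  /* sleeping: vruntime is still absolute */
        p->vruntime -= p->cfs_rq->min_vruntime;
    p->cfs_rq = new_rq;          /* stand-in for set_task_rq() */
    if (!on_rq)                  /* re-base on the new queue's clock */
        p->vruntime += p->cfs_rq->min_vruntime;
}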
@@ -3857,6 +4293,7 @@ static const struct sched_class fair_sched_class = {
3857 .enqueue_task = enqueue_task_fair, 4293 .enqueue_task = enqueue_task_fair,
3858 .dequeue_task = dequeue_task_fair, 4294 .dequeue_task = dequeue_task_fair,
3859 .yield_task = yield_task_fair, 4295 .yield_task = yield_task_fair,
4296 .yield_to_task = yield_to_task_fair,
3860 4297
3861 .check_preempt_curr = check_preempt_wakeup, 4298 .check_preempt_curr = check_preempt_wakeup,
3862 4299
@@ -3877,12 +4314,13 @@ static const struct sched_class fair_sched_class = {
3877 .task_fork = task_fork_fair, 4314 .task_fork = task_fork_fair,
3878 4315
3879 .prio_changed = prio_changed_fair, 4316 .prio_changed = prio_changed_fair,
4317 .switched_from = switched_from_fair,
3880 .switched_to = switched_to_fair, 4318 .switched_to = switched_to_fair,
3881 4319
3882 .get_rr_interval = get_rr_interval_fair, 4320 .get_rr_interval = get_rr_interval_fair,
3883 4321
3884#ifdef CONFIG_FAIR_GROUP_SCHED 4322#ifdef CONFIG_FAIR_GROUP_SCHED
3885 .moved_group = moved_group_fair, 4323 .task_move_group = task_move_group_fair,
3886#endif 4324#endif
3887}; 4325};
3888 4326