author     Peter Zijlstra <a.p.zijlstra@chello.nl>    2009-09-10 07:50:02 -0400
committer  Ingo Molnar <mingo@elte.hu>                2009-09-15 10:01:05 -0400
commit     c88d5910890ad35af283344417891344604f0438 (patch)
tree       4e2025d569c3e03a7ec5163f0a9bc159114ee14e
parent     e9c8431185d6c406887190519f6dbdd112641686 (diff)
sched: Merge select_task_rq_fair() and sched_balance_self()
The problem with wake_idle() is that it doesn't respect things like
cpu_power, which means it doesn't deal well with SMT nor the recent RT
interaction.

To cure this, it needs to do what sched_balance_self() does, which leads
to the possibility of merging select_task_rq_fair() and
sched_balance_self().

Modify sched_balance_self() to:

  - update_shares() when walking up the domain tree (it only called it
    for the top domain, but it should have done this anyway), which
    allows us to remove this ugly bit from try_to_wake_up().

  - do wake_affine() on the smallest domain that contains both this (the
    waking) and the prev (the wakee) cpu for WAKE invocations.

Then use the top-down balance steps it had to replace wake_idle().

This leads to the disappearance of SD_WAKE_BALANCE and SD_WAKE_IDLE_FAR,
with SD_WAKE_IDLE replaced by SD_BALANCE_WAKE.

SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective.

Touch all topology bits to replace the old flags with the new ones --
platforms might need re-tuning. Enabling SD_BALANCE_WAKE conditionally on
NUMA distance seems like a good additional feature: Magny-Cours and small
Nehalem systems would want this enabled, systems with slow interconnects
would not.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
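For orientation only (not part of the patch): below is a minimal user-space sketch of the merged wakeup path this commit introduces. The domain layout, CPU idleness, and the helper names (spans(), affine_ok(), find_idlest(), select_cpu_on_wake()) are all invented for the example; the real logic is the new select_task_rq_fair() in kernel/sched_fair.c at the bottom of this patch, which walks the waking CPU's domains bottom-up, tries wake_affine() in the first domain that also spans prev_cpu, and otherwise falls back to the find_idlest_group()/find_idlest_cpu() descent from the largest domain carrying the requested SD_BALANCE_* flag.

    /*
     * Illustrative only -- a tiny user-space model of the wakeup placement
     * logic added by this patch.  Nothing here is kernel API; domain spans,
     * idleness and the helper names are invented for the example.
     */
    #include <stdbool.h>
    #include <stdio.h>

    struct domain {
            int first_cpu, last_cpu;   /* CPUs covered by this domain */
            bool balance_wake;         /* models SD_BALANCE_WAKE      */
            bool wake_affine;          /* models SD_WAKE_AFFINE       */
    };

    static bool spans(const struct domain *d, int cpu)
    {
            return cpu >= d->first_cpu && cpu <= d->last_cpu;
    }

    /* Stand-in for wake_affine(): accept the waking CPU when it is idle. */
    static bool affine_ok(int this_cpu, const bool *idle)
    {
            return idle[this_cpu];
    }

    /*
     * Stand-in for the find_idlest_group()/find_idlest_cpu() descent:
     * pick the first idle CPU in the domain, or fall back to prev_cpu.
     */
    static int find_idlest(const struct domain *d, int prev_cpu, const bool *idle)
    {
            for (int cpu = d->first_cpu; cpu <= d->last_cpu; cpu++)
                    if (idle[cpu])
                            return cpu;
            return prev_cpu;
    }

    /*
     * Shape of the merged select_task_rq_fair() for SD_BALANCE_WAKE: walk
     * the waking CPU's domains from smallest to largest, try an affine
     * wakeup in the first domain that also spans prev_cpu, and remember
     * the largest wake-balancing domain for the top-down idlest search.
     */
    static int select_cpu_on_wake(const struct domain *domains, int ndomains,
                                  int this_cpu, int prev_cpu, const bool *idle)
    {
            const struct domain *sd = NULL;
            bool want_affine = true;

            for (int i = 0; i < ndomains; i++) {
                    const struct domain *tmp = &domains[i];

                    if (want_affine && tmp->wake_affine && spans(tmp, prev_cpu)) {
                            if (affine_ok(this_cpu, idle))
                                    return this_cpu;
                            want_affine = false;
                    }
                    if (tmp->balance_wake)
                            sd = tmp;
            }

            return sd ? find_idlest(sd, prev_cpu, idle) : prev_cpu;
    }

    int main(void)
    {
            /* Four CPUs: an SMT pair {0,1} below a package domain {0..3}. */
            const struct domain domains[] = {
                    { 0, 1, true, true },
                    { 0, 3, true, true },
            };
            const bool idle[4] = { false, true, false, true };

            /* Task last ran on CPU 2 and is being woken from CPU 0. */
            printf("wake task on CPU %d\n",
                   select_cpu_on_wake(domains, 2, 0, 2, idle));
            return 0;
    }

The toy collapses the real per-level descent into a single find_idlest() call, but the want_affine handling and the fall-back to a top-down search mirror the structure of the new select_task_rq_fair() shown in the kernel/sched_fair.c hunks below.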
-rw-r--r--  arch/ia64/include/asm/topology.h               5
-rw-r--r--  arch/mips/include/asm/mach-ip27/topology.h     2
-rw-r--r--  arch/powerpc/include/asm/topology.h            5
-rw-r--r--  arch/sh/include/asm/topology.h                 4
-rw-r--r--  arch/sparc/include/asm/topology_64.h           4
-rw-r--r--  arch/x86/include/asm/topology.h                4
-rw-r--r--  include/linux/sched.h                          7
-rw-r--r--  include/linux/topology.h                      16
-rw-r--r--  kernel/sched.c                                41
-rw-r--r--  kernel/sched_fair.c                          233
10 files changed, 84 insertions(+), 237 deletions(-)
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 7b4c8c70b2d1..cf6053b226c3 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -67,6 +67,7 @@ void build_cpu_to_node_map(void);
 		.flags			= SD_LOAD_BALANCE	\
 					| SD_BALANCE_NEWIDLE	\
 					| SD_BALANCE_EXEC	\
+					| SD_BALANCE_WAKE	\
 					| SD_WAKE_AFFINE,	\
 		.last_balance		= jiffies,		\
 		.balance_interval	= 1,			\
@@ -91,8 +92,8 @@ void build_cpu_to_node_map(void);
 		.flags			= SD_LOAD_BALANCE	\
 					| SD_BALANCE_EXEC	\
 					| SD_BALANCE_FORK	\
-					| SD_SERIALIZE		\
-					| SD_WAKE_BALANCE,	\
+					| SD_BALANCE_WAKE	\
+					| SD_SERIALIZE,		\
 		.last_balance		= jiffies,		\
 		.balance_interval	= 64,			\
 		.nr_balance_failed	= 0,			\
diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h
index 07547231e078..d8332398f5be 100644
--- a/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@ -48,7 +48,7 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
 	.cache_nice_tries	= 1,				\
 	.flags			= SD_LOAD_BALANCE		\
 				| SD_BALANCE_EXEC		\
-				| SD_WAKE_BALANCE,		\
+				| SD_BALANCE_WAKE,		\
 	.last_balance		= jiffies,			\
 	.balance_interval	= 1,				\
 	.nr_balance_failed	= 0,				\
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 054a16d68082..c6343313ff59 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -62,9 +62,8 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_NEWIDLE	\
-				| SD_WAKE_IDLE		\
-				| SD_SERIALIZE		\
-				| SD_WAKE_BALANCE,	\
+				| SD_BALANCE_WAKE	\
+				| SD_SERIALIZE,		\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index b69ee850906d..dc1531e2f25f 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -21,8 +21,8 @@
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
-				| SD_SERIALIZE		\
-				| SD_WAKE_BALANCE,	\
+				| SD_BALANCE_WAKE	\
+				| SD_SERIALIZE,		\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index e5ea8d332421..1d091abd2d13 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -57,8 +57,8 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
-				| SD_SERIALIZE		\
-				| SD_WAKE_BALANCE,	\
+				| SD_BALANCE_WAKE	\
+				| SD_SERIALIZE,		\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 }
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 26d06e052a18..966d58dc6274 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -145,14 +145,12 @@ extern unsigned long node_remap_size[];
 			| 1*SD_BALANCE_NEWIDLE			\
 			| 1*SD_BALANCE_EXEC			\
 			| 1*SD_BALANCE_FORK			\
-			| 0*SD_WAKE_IDLE			\
+			| 1*SD_BALANCE_WAKE			\
 			| 1*SD_WAKE_AFFINE			\
-			| 1*SD_WAKE_BALANCE			\
 			| 0*SD_SHARE_CPUPOWER			\
 			| 0*SD_POWERSAVINGS_BALANCE		\
 			| 0*SD_SHARE_PKG_RESOURCES		\
 			| 1*SD_SERIALIZE			\
-			| 1*SD_WAKE_IDLE_FAR			\
 			| 0*SD_PREFER_SIBLING			\
 			,					\
 	.last_balance		= jiffies,			\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3b0ca66bd6ce..c30bf3d516d1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -803,16 +803,15 @@ enum cpu_idle_type {
 #define SD_BALANCE_NEWIDLE	0x0002	/* Balance when about to become idle */
 #define SD_BALANCE_EXEC	0x0004	/* Balance on exec */
 #define SD_BALANCE_FORK	0x0008	/* Balance on fork, clone */
-#define SD_WAKE_IDLE		0x0010	/* Wake to idle CPU on task wakeup */
+#define SD_BALANCE_WAKE	0x0010	/* Balance on wakeup */
 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
-#define SD_WAKE_BALANCE	0x0040	/* Perform balancing at task wakeup */
+
 #define SD_SHARE_CPUPOWER	0x0080	/* Domain members share cpu power */
 #define SD_POWERSAVINGS_BALANCE	0x0100	/* Balance for power savings */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
-#define SD_WAKE_IDLE_FAR	0x0800	/* Gain latency sacrificing cache hit */
+
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
-#define SD_BALANCE_WAKE	0x2000	/* Balance on wakeup */
 
 enum powersavings_balance_level {
 	POWERSAVINGS_BALANCE_NONE = 0,	/* No power saving load balance */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 85e8cf7d393c..6a8cd15555bb 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -95,14 +95,12 @@ int arch_update_cpu_topology(void);
 			| 1*SD_BALANCE_NEWIDLE			\
 			| 1*SD_BALANCE_EXEC			\
 			| 1*SD_BALANCE_FORK			\
-			| 0*SD_WAKE_IDLE			\
+			| 1*SD_BALANCE_WAKE			\
 			| 1*SD_WAKE_AFFINE			\
-			| 1*SD_WAKE_BALANCE			\
 			| 1*SD_SHARE_CPUPOWER			\
 			| 0*SD_POWERSAVINGS_BALANCE		\
 			| 0*SD_SHARE_PKG_RESOURCES		\
 			| 0*SD_SERIALIZE			\
-			| 0*SD_WAKE_IDLE_FAR			\
 			| 0*SD_PREFER_SIBLING			\
 			,					\
 	.last_balance		= jiffies,			\
@@ -129,13 +127,11 @@ int arch_update_cpu_topology(void);
 			| 1*SD_BALANCE_NEWIDLE			\
 			| 1*SD_BALANCE_EXEC			\
 			| 1*SD_BALANCE_FORK			\
-			| 1*SD_WAKE_IDLE			\
+			| 1*SD_BALANCE_WAKE			\
 			| 1*SD_WAKE_AFFINE			\
-			| 1*SD_WAKE_BALANCE			\
 			| 0*SD_SHARE_CPUPOWER			\
 			| 1*SD_SHARE_PKG_RESOURCES		\
 			| 0*SD_SERIALIZE			\
-			| 0*SD_WAKE_IDLE_FAR			\
 			| sd_balance_for_mc_power()		\
 			| sd_power_saving_flags()		\
 			,					\
@@ -163,13 +159,11 @@ int arch_update_cpu_topology(void);
 			| 1*SD_BALANCE_NEWIDLE			\
 			| 1*SD_BALANCE_EXEC			\
 			| 1*SD_BALANCE_FORK			\
-			| 1*SD_WAKE_IDLE			\
+			| 1*SD_BALANCE_WAKE			\
 			| 0*SD_WAKE_AFFINE			\
-			| 1*SD_WAKE_BALANCE			\
 			| 0*SD_SHARE_CPUPOWER			\
 			| 0*SD_SHARE_PKG_RESOURCES		\
 			| 0*SD_SERIALIZE			\
-			| 0*SD_WAKE_IDLE_FAR			\
 			| sd_balance_for_package_power()	\
 			| sd_power_saving_flags()		\
 			,					\
@@ -191,14 +185,12 @@ int arch_update_cpu_topology(void);
 			| 1*SD_BALANCE_NEWIDLE			\
 			| 0*SD_BALANCE_EXEC			\
 			| 0*SD_BALANCE_FORK			\
-			| 0*SD_WAKE_IDLE			\
+			| 0*SD_BALANCE_WAKE			\
 			| 1*SD_WAKE_AFFINE			\
-			| 0*SD_WAKE_BALANCE			\
 			| 0*SD_SHARE_CPUPOWER			\
 			| 0*SD_POWERSAVINGS_BALANCE		\
 			| 0*SD_SHARE_PKG_RESOURCES		\
 			| 1*SD_SERIALIZE			\
-			| 1*SD_WAKE_IDLE_FAR			\
 			| 0*SD_PREFER_SIBLING			\
 			,					\
 	.last_balance		= jiffies,			\
diff --git a/kernel/sched.c b/kernel/sched.c
index fc6fda881d2e..6c819f338b11 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -512,14 +512,6 @@ struct root_domain {
 #ifdef CONFIG_SMP
 	struct cpupri cpupri;
 #endif
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	/*
-	 * Preferred wake up cpu nominated by sched_mc balance that will be
-	 * used when most cpus are idle in the system indicating overall very
-	 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
-	 */
-	unsigned int sched_mc_preferred_wakeup_cpu;
-#endif
 };
 
 /*
@@ -2315,22 +2307,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (!sched_feat(SYNC_WAKEUPS))
 		sync = 0;
 
-#ifdef CONFIG_SMP
-	if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
-		struct sched_domain *sd;
-
-		this_cpu = raw_smp_processor_id();
-		cpu = task_cpu(p);
-
-		for_each_domain(this_cpu, sd) {
-			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-				update_shares(sd);
-				break;
-			}
-		}
-	}
-#endif
-
 	this_cpu = get_cpu();
 
 	smp_wmb();
@@ -3533,11 +3509,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 	*imbalance = sds->min_load_per_task;
 	sds->busiest = sds->group_min;
 
-	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-			group_first_cpu(sds->group_leader);
-	}
-
 	return 1;
 
 }
@@ -7850,9 +7821,7 @@ static int sd_degenerate(struct sched_domain *sd)
 	}
 
 	/* Following flags don't use groups */
-	if (sd->flags & (SD_WAKE_IDLE |
-			 SD_WAKE_AFFINE |
-			 SD_WAKE_BALANCE))
+	if (sd->flags & (SD_WAKE_AFFINE))
 		return 0;
 
 	return 1;
@@ -7869,10 +7838,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
 		return 0;
 
-	/* Does parent contain flags not in child? */
-	/* WAKE_BALANCE is a subset of WAKE_AFFINE */
-	if (cflags & SD_WAKE_AFFINE)
-		pflags &= ~SD_WAKE_BALANCE;
 	/* Flags needing groups don't count if only 1 group in parent */
 	if (parent->groups == parent->groups->next) {
 		pflags &= ~(SD_LOAD_BALANCE |
@@ -8558,10 +8523,10 @@ static void set_domain_attribute(struct sched_domain *sd,
 		request = attr->relax_domain_level;
 	if (request < sd->level) {
 		/* turn off idle balance on this domain */
-		sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
+		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	} else {
 		/* turn on idle balance on this domain */
-		sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
+		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	}
 }
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2eb5b934715..09d19f77eb3a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1062,83 +1062,6 @@ static void yield_task_fair(struct rq *rq)
 		se->vruntime = rightmost->vruntime + 1;
 }
 
-/*
- * wake_idle() will wake a task on an idle cpu if task->cpu is
- * not idle and an idle cpu is available. The span of cpus to
- * search starts with cpus closest then further out as needed,
- * so we always favor a closer, idle cpu.
- * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (rq->rd->online)
- *
- * Returns the CPU we should wake onto.
- */
-#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-
-#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
-
-static int wake_idle(int cpu, struct task_struct *p)
-{
-	struct sched_domain *sd;
-	int i;
-	unsigned int chosen_wakeup_cpu;
-	int this_cpu;
-	struct rq *task_rq = task_rq(p);
-
-	/*
-	 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
-	 * are idle and this is not a kernel thread and this task's affinity
-	 * allows it to be moved to preferred cpu, then just move!
-	 */
-
-	this_cpu = smp_processor_id();
-	chosen_wakeup_cpu =
-		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
-
-	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
-		idle_cpu(cpu) && idle_cpu(this_cpu) &&
-		p->mm && !(p->flags & PF_KTHREAD) &&
-		cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
-		return chosen_wakeup_cpu;
-
-	/*
-	 * If it is idle, then it is the best cpu to run this task.
-	 *
-	 * This cpu is also the best, if it has more than one task already.
-	 * Siblings must be also busy(in most cases) as they didn't already
-	 * pickup the extra load from this cpu and hence we need not check
-	 * sibling runqueue info. This will avoid the checks and cache miss
-	 * penalities associated with that.
-	 */
-	if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
-		return cpu;
-
-	for_each_domain(cpu, sd) {
-		if ((sd->flags & SD_WAKE_IDLE)
-		    || ((sd->flags & SD_WAKE_IDLE_FAR)
-			&& !task_hot(p, task_rq->clock, sd))) {
-			for_each_cpu_and(i, sched_domain_span(sd),
-					 &p->cpus_allowed) {
-				if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
-					if (i != task_cpu(p)) {
-						schedstat_inc(p,
-						       se.nr_wakeups_idle);
-					}
-					return i;
-				}
-			}
-		} else {
-			break;
-		}
-	}
-	return cpu;
-}
-#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
-static inline int wake_idle(int cpu, struct task_struct *p)
-{
-	return cpu;
-}
-#endif
-
 #ifdef CONFIG_SMP
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1225,21 +1148,22 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 #endif
 
-static int
-wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
-	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
-	    int idx, unsigned long load, unsigned long this_load,
-	    unsigned int imbalance)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
-	struct task_struct *curr = this_rq->curr;
-	struct task_group *tg;
-	unsigned long tl = this_load;
+	struct task_struct *curr = current;
+	unsigned long this_load, load;
+	int idx, this_cpu, prev_cpu;
 	unsigned long tl_per_task;
+	unsigned int imbalance;
+	struct task_group *tg;
 	unsigned long weight;
 	int balanced;
 
-	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
-		return 0;
+	idx	  = sd->wake_idx;
+	this_cpu  = smp_processor_id();
+	prev_cpu  = task_cpu(p);
+	load	  = source_load(prev_cpu, idx);
+	this_load = target_load(this_cpu, idx);
 
 	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
 			p->se.avg_overlap > sysctl_sched_migration_cost))
@@ -1254,24 +1178,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 		tg = task_group(current);
 		weight = current->se.load.weight;
 
-		tl += effective_load(tg, this_cpu, -weight, -weight);
+		this_load += effective_load(tg, this_cpu, -weight, -weight);
 		load += effective_load(tg, prev_cpu, 0, -weight);
 	}
 
 	tg = task_group(p);
 	weight = p->se.load.weight;
 
+	imbalance = 100 + (sd->imbalance_pct - 100) / 2;
+
 	/*
 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
-	 * due to the sync cause above having dropped tl to 0, we'll always have
-	 * an imbalance, but there's really nothing you can do about that, so
-	 * that's good too.
+	 * due to the sync cause above having dropped this_load to 0, we'll
+	 * always have an imbalance, but there's really nothing you can do
+	 * about that, so that's good too.
 	 *
 	 * Otherwise check if either cpus are near enough in load to allow this
 	 * task to be woken on this_cpu.
 	 */
-	balanced = !tl ||
-		100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+	balanced = !this_load ||
+		100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
 		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
 	/*
@@ -1285,14 +1211,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	schedstat_inc(p, se.nr_wakeups_affine_attempts);
 	tl_per_task = cpu_avg_load_per_task(this_cpu);
 
-	if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
-			tl_per_task)) {
+	if (balanced ||
+	    (this_load <= load &&
+	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
 		/*
 		 * This domain has SD_WAKE_AFFINE and
 		 * p is cache cold in this domain, and
 		 * there is no bad imbalance.
 		 */
-		schedstat_inc(this_sd, ttwu_move_affine);
+		schedstat_inc(sd, ttwu_move_affine);
 		schedstat_inc(p, se.nr_wakeups_affine);
 
 		return 1;
@@ -1300,72 +1227,6 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	return 0;
 }
 
-static int sched_balance_self(int cpu, int flag);
-
-static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
-{
-	struct sched_domain *sd, *this_sd = NULL;
-	int prev_cpu, this_cpu, new_cpu;
-	unsigned long load, this_load;
-	struct rq *this_rq;
-	unsigned int imbalance;
-	int idx;
-
-	prev_cpu	= task_cpu(p);
-	this_cpu	= smp_processor_id();
-	this_rq		= cpu_rq(this_cpu);
-	new_cpu		= prev_cpu;
-
-	if (flag != SD_BALANCE_WAKE)
-		return sched_balance_self(this_cpu, flag);
-
-	/*
-	 * 'this_sd' is the first domain that both
-	 * this_cpu and prev_cpu are present in:
-	 */
-	for_each_domain(this_cpu, sd) {
-		if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
-			this_sd = sd;
-			break;
-		}
-	}
-
-	if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
-		goto out;
-
-	/*
-	 * Check for affine wakeup and passive balancing possibilities.
-	 */
-	if (!this_sd)
-		goto out;
-
-	idx = this_sd->wake_idx;
-
-	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
-	load = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
-
-	if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
-			load, this_load, imbalance))
-		return this_cpu;
-
-	/*
-	 * Start passive balancing when half the imbalance_pct
-	 * limit is reached.
-	 */
-	if (this_sd->flags & SD_WAKE_BALANCE) {
-		if (imbalance*this_load <= 100*load) {
-			schedstat_inc(this_sd, ttwu_move_balance);
-			schedstat_inc(p, se.nr_wakeups_passive);
-			return this_cpu;
-		}
-	}
-
-out:
-	return wake_idle(new_cpu, p);
-}
-
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
@@ -1455,10 +1316,20 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  *
  * preempt must be disabled.
  */
-static int sched_balance_self(int cpu, int flag)
+static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 {
 	struct task_struct *t = current;
 	struct sched_domain *tmp, *sd = NULL;
+	int cpu = smp_processor_id();
+	int prev_cpu = task_cpu(p);
+	int new_cpu = cpu;
+	int want_affine = 0;
+
+	if (flag & SD_BALANCE_WAKE) {
+		if (sched_feat(AFFINE_WAKEUPS))
+			want_affine = 1;
+		new_cpu = prev_cpu;
+	}
 
 	for_each_domain(cpu, tmp) {
 		/*
@@ -1466,16 +1337,38 @@ static int sched_balance_self(int cpu, int flag)
 		 */
 		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
 			break;
-		if (tmp->flags & flag)
-			sd = tmp;
-	}
 
-	if (sd)
-		update_shares(sd);
+		switch (flag) {
+		case SD_BALANCE_WAKE:
+			if (!sched_feat(LB_WAKEUP_UPDATE))
+				break;
+		case SD_BALANCE_FORK:
+		case SD_BALANCE_EXEC:
+			if (root_task_group_empty())
+				break;
+			update_shares(tmp);
+		default:
+			break;
+		}
+
+		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+
+			if (wake_affine(tmp, p, sync))
+				return cpu;
+
+			want_affine = 0;
+		}
+
+		if (!(tmp->flags & flag))
+			continue;
+
+		sd = tmp;
+	}
 
 	while (sd) {
 		struct sched_group *group;
-		int new_cpu, weight;
+		int weight;
 
 		if (!(sd->flags & flag)) {
 			sd = sd->child;
@@ -1508,7 +1401,7 @@ static int sched_balance_self(int cpu, int flag)
 		/* while loop will break here if sd == NULL */
 	}
 
-	return cpu;
+	return new_cpu;
 }
 #endif /* CONFIG_SMP */
 