author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2009-09-10 07:50:02 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-09-15 10:01:05 -0400
commit		c88d5910890ad35af283344417891344604f0438 (patch)
tree		4e2025d569c3e03a7ec5163f0a9bc159114ee14e /kernel/sched_fair.c
parent		e9c8431185d6c406887190519f6dbdd112641686 (diff)
sched: Merge select_task_rq_fair() and sched_balance_self()
The problem with wake_idle() is that it doesn't respect things like cpu_power, which means it doesn't deal well with SMT nor with the recent RT interaction.

To cure this, it needs to do what sched_balance_self() does, which leads to the possibility of merging select_task_rq_fair() and sched_balance_self().

Modify sched_balance_self() to:

  - update_shares() when walking up the domain tree (it only called it for the
    top domain, but it should have done this anyway), which allows us to remove
    this ugly bit from try_to_wake_up().

  - do wake_affine() on the smallest domain that contains both this (the waking)
    and the prev (the wakee) cpu for WAKE invocations.

Then use the top-down balance steps it had to replace wake_idle().

This leads to the disappearance of SD_WAKE_BALANCE and SD_WAKE_IDLE_FAR, with SD_WAKE_IDLE replaced by SD_BALANCE_WAKE.

SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective.

Touch all topology bits to replace the old with the new SD flags -- platforms might need re-tuning. Enabling SD_BALANCE_WAKE conditionally on a NUMA distance seems like a good additional feature; Magny-Cours and small Nehalem systems would want this enabled, systems with slow interconnects would not.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
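
The net shape of the merged wakeup path may be easier to follow before reading the hunks. The userspace-only toy model below mirrors the order of decisions the new select_task_rq_fair() makes on a wakeup: one bottom-up walk of the domain tree (where update_shares() now happens), an affine-wakeup attempt on the smallest domain spanning both the waking cpu and the wakee's previous cpu, and otherwise remembering the highest domain carrying the requested balance flag for the old top-down idlest-group descent. All structures, helper names and domain data here are invented for illustration; the authoritative code is the new select_task_rq_fair() in the diff below.

#include <stdio.h>
#include <stdbool.h>

#define SD_BALANCE_WAKE	0x01
#define SD_WAKE_AFFINE	0x02

/* Toy stand-in for a sched_domain: just the bits the walk looks at. */
struct toy_domain {
	const char *name;
	unsigned int flags;
	bool spans_prev_cpu;		/* does this domain contain prev_cpu? */
	struct toy_domain *parent;
};

/* Stand-in for wake_affine(): pretend the load comparison says "balanced". */
static bool toy_wake_affine(struct toy_domain *sd)
{
	return sd->flags & SD_WAKE_AFFINE;
}

static int toy_select(struct toy_domain *lowest, int this_cpu, int prev_cpu,
		      unsigned int flag)
{
	struct toy_domain *tmp, *sd = NULL;
	int new_cpu = (flag & SD_BALANCE_WAKE) ? prev_cpu : this_cpu;
	bool want_affine = flag & SD_BALANCE_WAKE;

	for (tmp = lowest; tmp; tmp = tmp->parent) {
		printf("visiting %s (update_shares() would run here)\n",
		       tmp->name);

		/* Affine wakeup on the smallest domain spanning both cpus. */
		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
		    tmp->spans_prev_cpu) {
			if (toy_wake_affine(tmp))
				return this_cpu;
			want_affine = false;
		}

		/* Remember the highest domain that carries this balance flag. */
		if (tmp->flags & flag)
			sd = tmp;
	}

	/* The real code now walks sd top-down via find_idlest_group(). */
	if (sd)
		printf("would descend %s looking for the idlest cpu\n", sd->name);

	return new_cpu;
}

int main(void)
{
	struct toy_domain numa = { "NUMA", 0, true, NULL };
	struct toy_domain mc = { "MC", SD_BALANCE_WAKE | SD_WAKE_AFFINE, true, &numa };
	struct toy_domain smt = { "SMT", SD_BALANCE_WAKE | SD_WAKE_AFFINE, false, &mc };

	/* Waker runs on cpu 0; the wakee last ran on cpu 2 (outside the SMT pair). */
	printf("chose cpu %d\n", toy_select(&smt, 0, 2, SD_BALANCE_WAKE));
	return 0;
}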
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--	kernel/sched_fair.c	| 233
1 file changed, 63 insertions(+), 170 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2eb5b934715..09d19f77eb3a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1062,83 +1062,6 @@ static void yield_task_fair(struct rq *rq)
 	se->vruntime = rightmost->vruntime + 1;
 }
 
-/*
- * wake_idle() will wake a task on an idle cpu if task->cpu is
- * not idle and an idle cpu is available. The span of cpus to
- * search starts with cpus closest then further out as needed,
- * so we always favor a closer, idle cpu.
- * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (rq->rd->online)
- *
- * Returns the CPU we should wake onto.
- */
-#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-
-#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
-
-static int wake_idle(int cpu, struct task_struct *p)
-{
-	struct sched_domain *sd;
-	int i;
-	unsigned int chosen_wakeup_cpu;
-	int this_cpu;
-	struct rq *task_rq = task_rq(p);
-
-	/*
-	 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
-	 * are idle and this is not a kernel thread and this task's affinity
-	 * allows it to be moved to preferred cpu, then just move!
-	 */
-
-	this_cpu = smp_processor_id();
-	chosen_wakeup_cpu =
-		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
-
-	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
-		idle_cpu(cpu) && idle_cpu(this_cpu) &&
-		p->mm && !(p->flags & PF_KTHREAD) &&
-		cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
-		return chosen_wakeup_cpu;
-
-	/*
-	 * If it is idle, then it is the best cpu to run this task.
-	 *
-	 * This cpu is also the best, if it has more than one task already.
-	 * Siblings must be also busy(in most cases) as they didn't already
-	 * pickup the extra load from this cpu and hence we need not check
-	 * sibling runqueue info. This will avoid the checks and cache miss
-	 * penalities associated with that.
-	 */
-	if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
-		return cpu;
-
-	for_each_domain(cpu, sd) {
-		if ((sd->flags & SD_WAKE_IDLE)
-		    || ((sd->flags & SD_WAKE_IDLE_FAR)
-			&& !task_hot(p, task_rq->clock, sd))) {
-			for_each_cpu_and(i, sched_domain_span(sd),
-					 &p->cpus_allowed) {
-				if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
-					if (i != task_cpu(p)) {
-						schedstat_inc(p,
-						       se.nr_wakeups_idle);
-					}
-					return i;
-				}
-			}
-		} else {
-			break;
-		}
-	}
-	return cpu;
-}
-#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
-static inline int wake_idle(int cpu, struct task_struct *p)
-{
-	return cpu;
-}
-#endif
-
 #ifdef CONFIG_SMP
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1225,21 +1148,22 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 #endif
 
-static int
-wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
-	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
-	    int idx, unsigned long load, unsigned long this_load,
-	    unsigned int imbalance)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
-	struct task_struct *curr = this_rq->curr;
-	struct task_group *tg;
-	unsigned long tl = this_load;
+	struct task_struct *curr = current;
+	unsigned long this_load, load;
+	int idx, this_cpu, prev_cpu;
 	unsigned long tl_per_task;
+	unsigned int imbalance;
+	struct task_group *tg;
 	unsigned long weight;
 	int balanced;
 
-	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
-		return 0;
+	idx = sd->wake_idx;
+	this_cpu = smp_processor_id();
+	prev_cpu = task_cpu(p);
+	load = source_load(prev_cpu, idx);
+	this_load = target_load(this_cpu, idx);
 
 	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
 			p->se.avg_overlap > sysctl_sched_migration_cost))
@@ -1254,24 +1178,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 		tg = task_group(current);
 		weight = current->se.load.weight;
 
-		tl += effective_load(tg, this_cpu, -weight, -weight);
+		this_load += effective_load(tg, this_cpu, -weight, -weight);
 		load += effective_load(tg, prev_cpu, 0, -weight);
 	}
 
 	tg = task_group(p);
 	weight = p->se.load.weight;
 
+	imbalance = 100 + (sd->imbalance_pct - 100) / 2;
+
 	/*
 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
-	 * due to the sync cause above having dropped tl to 0, we'll always have
-	 * an imbalance, but there's really nothing you can do about that, so
-	 * that's good too.
+	 * due to the sync cause above having dropped this_load to 0, we'll
+	 * always have an imbalance, but there's really nothing you can do
+	 * about that, so that's good too.
 	 *
 	 * Otherwise check if either cpus are near enough in load to allow this
 	 * task to be woken on this_cpu.
 	 */
-	balanced = !tl ||
-		100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+	balanced = !this_load ||
+		100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
 		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
 	/*
@@ -1285,14 +1211,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	schedstat_inc(p, se.nr_wakeups_affine_attempts);
 	tl_per_task = cpu_avg_load_per_task(this_cpu);
 
-	if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
-			tl_per_task)) {
+	if (balanced ||
+	    (this_load <= load &&
+	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
 		/*
 		 * This domain has SD_WAKE_AFFINE and
 		 * p is cache cold in this domain, and
 		 * there is no bad imbalance.
 		 */
-		schedstat_inc(this_sd, ttwu_move_affine);
+		schedstat_inc(sd, ttwu_move_affine);
 		schedstat_inc(p, se.nr_wakeups_affine);
 
 		return 1;
@@ -1300,72 +1227,6 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	return 0;
 }
 
-static int sched_balance_self(int cpu, int flag);
-
-static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
-{
-	struct sched_domain *sd, *this_sd = NULL;
-	int prev_cpu, this_cpu, new_cpu;
-	unsigned long load, this_load;
-	struct rq *this_rq;
-	unsigned int imbalance;
-	int idx;
-
-	prev_cpu = task_cpu(p);
-	this_cpu = smp_processor_id();
-	this_rq = cpu_rq(this_cpu);
-	new_cpu = prev_cpu;
-
-	if (flag != SD_BALANCE_WAKE)
-		return sched_balance_self(this_cpu, flag);
-
-	/*
-	 * 'this_sd' is the first domain that both
-	 * this_cpu and prev_cpu are present in:
-	 */
-	for_each_domain(this_cpu, sd) {
-		if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
-			this_sd = sd;
-			break;
-		}
-	}
-
-	if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
-		goto out;
-
-	/*
-	 * Check for affine wakeup and passive balancing possibilities.
-	 */
-	if (!this_sd)
-		goto out;
-
-	idx = this_sd->wake_idx;
-
-	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
-	load = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
-
-	if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
-			load, this_load, imbalance))
-		return this_cpu;
-
-	/*
-	 * Start passive balancing when half the imbalance_pct
-	 * limit is reached.
-	 */
-	if (this_sd->flags & SD_WAKE_BALANCE) {
-		if (imbalance*this_load <= 100*load) {
-			schedstat_inc(this_sd, ttwu_move_balance);
-			schedstat_inc(p, se.nr_wakeups_passive);
-			return this_cpu;
-		}
-	}
-
-out:
-	return wake_idle(new_cpu, p);
-}
-
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
@@ -1455,10 +1316,20 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  *
  * preempt must be disabled.
  */
-static int sched_balance_self(int cpu, int flag)
+static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 {
 	struct task_struct *t = current;
 	struct sched_domain *tmp, *sd = NULL;
+	int cpu = smp_processor_id();
+	int prev_cpu = task_cpu(p);
+	int new_cpu = cpu;
+	int want_affine = 0;
+
+	if (flag & SD_BALANCE_WAKE) {
+		if (sched_feat(AFFINE_WAKEUPS))
+			want_affine = 1;
+		new_cpu = prev_cpu;
+	}
 
 	for_each_domain(cpu, tmp) {
 		/*
@@ -1466,16 +1337,38 @@ static int sched_balance_self(int cpu, int flag)
 		 */
 		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
 			break;
-		if (tmp->flags & flag)
-			sd = tmp;
-	}
 
-	if (sd)
-		update_shares(sd);
+		switch (flag) {
+		case SD_BALANCE_WAKE:
+			if (!sched_feat(LB_WAKEUP_UPDATE))
+				break;
+		case SD_BALANCE_FORK:
+		case SD_BALANCE_EXEC:
+			if (root_task_group_empty())
+				break;
+			update_shares(tmp);
+		default:
+			break;
+		}
+
+		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+
+			if (wake_affine(tmp, p, sync))
+				return cpu;
+
+			want_affine = 0;
+		}
+
+		if (!(tmp->flags & flag))
+			continue;
+
+		sd = tmp;
+	}
 
 	while (sd) {
 		struct sched_group *group;
-		int new_cpu, weight;
+		int weight;
 
 		if (!(sd->flags & flag)) {
 			sd = sd->child;
@@ -1508,7 +1401,7 @@ static int sched_balance_self(int cpu, int flag)
 		/* while loop will break here if sd == NULL */
 	}
 
-	return cpu;
+	return new_cpu;
 }
 #endif /* CONFIG_SMP */
 
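
For reference, the "balanced" test in the reworked wake_affine() above reduces, when CONFIG_FAIR_GROUP_SCHED is not set (effective_load() then simply returns its wl argument) and the sync adjustment is left aside, to a plain percentage comparison between the destination and source loads. A minimal standalone sketch with made-up load figures, not kernel code:

#include <stdio.h>

int main(void)
{
	unsigned long load = 1024;	/* source_load(prev_cpu, idx), assumed */
	unsigned long this_load = 2048;	/* target_load(this_cpu, idx), assumed */
	unsigned long weight = 1024;	/* p->se.load.weight of a nice-0 task */
	unsigned int imbalance_pct = 125;	/* a typical sd->imbalance_pct */

	/* imbalance = 100 + (sd->imbalance_pct - 100) / 2  ->  112 here */
	unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;

	/*
	 * balanced = !this_load ||
	 *	100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
	 *	imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
	 * which, with effective_load(tg, cpu, wl, wg) == wl, degenerates to:
	 */
	int balanced = !this_load ||
		100 * (this_load + weight) <= imbalance * load;

	printf("imbalance=%u%%  lhs=%lu  rhs=%lu  balanced=%d\n",
	       imbalance, 100 * (this_load + weight),
	       imbalance * load, balanced);
	return 0;
}

With these numbers the waking cpu already carries twice the load of the previous cpu, so the test rejects the affine move and select_task_rq_fair() falls through to the domain descent.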