Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c | 234
1 file changed, 58 insertions(+), 176 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fb8994c6d4bb..18fd17172eb6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -409,64 +409,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 /*
- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
- * that it favours >=0 over <0.
- *
- *   -20         |
- *               |
- *     0 --------+-------
- *             .'
- *    19     .'
- *
- */
-static unsigned long
-calc_delta_asym(unsigned long delta, struct sched_entity *se)
-{
-        struct load_weight lw = {
-                .weight = NICE_0_LOAD,
-                .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
-        };
-
-        for_each_sched_entity(se) {
-                struct load_weight *se_lw = &se->load;
-                unsigned long rw = cfs_rq_of(se)->load.weight;
-
-#ifdef CONFIG_FAIR_SCHED_GROUP
-                struct cfs_rq *cfs_rq = se->my_q;
-                struct task_group *tg = NULL
-
-                if (cfs_rq)
-                        tg = cfs_rq->tg;
-
-                if (tg && tg->shares < NICE_0_LOAD) {
-                        /*
-                         * scale shares to what it would have been had
-                         * tg->weight been NICE_0_LOAD:
-                         *
-                         *   weight = 1024 * shares / tg->weight
-                         */
-                        lw.weight *= se->load.weight;
-                        lw.weight /= tg->shares;
-
-                        lw.inv_weight = 0;
-
-                        se_lw = &lw;
-                        rw += lw.weight - se->load.weight;
-                } else
-#endif
-
-                if (se->load.weight < NICE_0_LOAD) {
-                        se_lw = &lw;
-                        rw += NICE_0_LOAD - se->load.weight;
-                }
-
-                delta = calc_delta_mine(delta, rw, se_lw);
-        }
-
-        return delta;
-}
-
-/*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
  */
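The removed calc_delta_asym() was a thin wrapper around calc_delta_mine(), which scales a time delta by a weight ratio using fixed-point arithmetic; later hunks simply call calc_delta_mine() directly. The user-space sketch below is not part of the patch: the constants, the simplified struct and the rounding are stand-ins for the kernel's implementation in kernel/sched.c, kept only to show the weight/inv_weight trick.

#include <stdio.h>

#define WMULT_SHIFT     32
#define NICE_0_LOAD     1024UL

struct load_weight {
        unsigned long weight;
        unsigned long inv_weight;       /* ~2^WMULT_SHIFT / weight */
};

/* Scale delta by weight/lw->weight, the way calc_delta_mine() does. */
static unsigned long scale_delta(unsigned long delta, unsigned long weight,
                                 struct load_weight *lw)
{
        unsigned long long tmp;

        if (!lw->inv_weight)
                lw->inv_weight = ((1ULL << WMULT_SHIFT) + lw->weight / 2) /
                                 lw->weight;

        tmp = (unsigned long long)delta * weight * lw->inv_weight;
        return (unsigned long)(tmp >> WMULT_SHIFT);
}

int main(void)
{
        struct load_weight nice0 = { .weight = NICE_0_LOAD };
        struct load_weight heavy = { .weight = 2 * NICE_0_LOAD };

        /* A 4ms delta: unchanged against a nice-0 weight, halved against 2x. */
        printf("%lu %lu\n",
               scale_delta(4000000UL, NICE_0_LOAD, &nice0),
               scale_delta(4000000UL, NICE_0_LOAD, &heavy));
        return 0;
}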
@@ -586,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_add(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
+	if (entity_is_task(se)) {
 		add_cfs_task_weight(cfs_rq, se->load.weight);
+		list_add(&se->group_node, &cfs_rq->tasks);
+	}
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
-	list_add(&se->group_node, &cfs_rq->tasks);
 }
 
 static void
@@ -599,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
+	if (entity_is_task(se)) {
 		add_cfs_task_weight(cfs_rq, -se->load.weight);
+		list_del_init(&se->group_node);
+	}
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
-	list_del_init(&se->group_node);
 }
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1085,7 +1029,6 @@ static long effective_load(struct task_group *tg, int cpu,
 		long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
-	long more_w;
 
 	if (!tg->parent)
 		return wl;
@@ -1097,18 +1040,17 @@ static long effective_load(struct task_group *tg, int cpu,
 	if (!wl && sched_feat(ASYM_EFF_LOAD))
 		return wl;
 
-	/*
-	 * Instead of using this increment, also add the difference
-	 * between when the shares were last updated and now.
-	 */
-	more_w = se->my_q->load.weight - se->my_q->rq_weight;
-	wl += more_w;
-	wg += more_w;
-
 	for_each_sched_entity(se) {
-#define D(n) (likely(n) ? (n) : 1)
-
 		long S, rw, s, a, b;
+		long more_w;
+
+		/*
+		 * Instead of using this increment, also add the difference
+		 * between when the shares were last updated and now.
+		 */
+		more_w = se->my_q->load.weight - se->my_q->rq_weight;
+		wl += more_w;
+		wg += more_w;
 
 		S = se->my_q->tg->shares;
 		s = se->my_q->shares;
@@ -1117,7 +1059,11 @@ static long effective_load(struct task_group *tg, int cpu,
 		a = S*(rw + wl);
 		b = S*rw + s*wg;
 
-		wl = s*(a-b)/D(b);
+		wl = s*(a-b);
+
+		if (likely(b))
+			wl /= b;
+
 		/*
 		 * Assume the group is already running and will
 		 * thus already be accounted for in the weight.
@@ -1126,7 +1072,6 @@ static long effective_load(struct task_group *tg, int cpu,
 		 * alter the group weight.
 		 */
 		wg = 0;
-#undef D
 	}
 
 	return wl;
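The two effective_load() hunks move the more_w correction inside the per-level loop and replace the old D(n) divide-by-zero macro with an explicit test. Below is a stand-alone model of a single level of that walk; the function name, parameters and sample numbers are chosen here for illustration and are not taken from the kernel, only the arithmetic mirrors the new code, including the explicit "if (b) wl /= b" guard.

#include <stdio.h>

/*
 * One level of the effective_load() recursion: S is the group's configured
 * shares, s its per-cpu share, rw the cpu's runqueue weight, wl/wg the task
 * and group weight deltas being propagated upward.
 */
static long effective_load_step(long S, long s, long rw, long *wl, long wg)
{
        long a = S * (rw + *wl);
        long b = S * rw + s * wg;

        *wl = s * (a - b);
        if (b)                  /* avoid dividing by zero on an empty rq */
                *wl /= b;
        return *wl;
}

int main(void)
{
        long wl = 1024;         /* adding one nice-0 task ... */
        long wg = 1024;         /* ... and its group weight */

        effective_load_step(1024, 512, 2048, &wl, wg);
        printf("propagated wl = %ld\n", wl);
        return 0;
}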
@@ -1143,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 #endif
 
 static int
-wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
 	    int idx, unsigned long load, unsigned long this_load,
 	    unsigned int imbalance)
@@ -1158,6 +1103,11 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
 		return 0;
 
+	if (!sync && sched_feat(SYNC_WAKEUPS) &&
+	    curr->se.avg_overlap < sysctl_sched_migration_cost &&
+	    p->se.avg_overlap < sysctl_sched_migration_cost)
+		sync = 1;
+
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
 	 * effect of the currently running task from the load
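The block added here promotes a wakeup to "sync" when both the waker and the wakee historically run for less than the migration cost before overlapping. A small model of that test follows; the threshold constant and parameter names are assumptions standing in for sysctl_sched_migration_cost and se.avg_overlap, not kernel definitions.

#include <stdbool.h>
#include <stdio.h>

#define MIGRATION_COST_NS       500000ULL      /* assumed threshold, ~0.5ms */

/* Treat a wakeup as synchronous when both tasks barely overlap. */
static bool promote_to_sync(bool sync, unsigned long long waker_overlap_ns,
                            unsigned long long wakee_overlap_ns)
{
        if (!sync &&
            waker_overlap_ns < MIGRATION_COST_NS &&
            wakee_overlap_ns < MIGRATION_COST_NS)
                return true;
        return sync;
}

int main(void)
{
        printf("%d %d\n",
               promote_to_sync(false, 100000, 200000),  /* promoted */
               promote_to_sync(false, 100000, 900000)); /* stays async */
        return 0;
}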
@@ -1182,17 +1132,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	 * a reasonable amount of time then attract this newly
 	 * woken task:
 	 */
-	if (sync && balanced) {
-		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-		    p->se.avg_overlap < sysctl_sched_migration_cost)
-			return 1;
-	}
+	if (sync && balanced)
+		return 1;
 
 	schedstat_inc(p, se.nr_wakeups_affine_attempts);
 	tl_per_task = cpu_avg_load_per_task(this_cpu);
 
-	if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
-			balanced) {
+	if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
+			tl_per_task)) {
 		/*
 		 * This domain has SD_WAKE_AFFINE and
 		 * p is cache cold in this domain, and
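After this hunk a balanced domain is enough on its own to pick the waking CPU; the per-task load comparison is only consulted when the domain is not balanced, and the old inner avg_overlap test has moved up into the sync promotion above. A condensed model of the new decision order; the parameter names and plain numbers below stand in for the rq loads used by the real code.

#include <stdbool.h>
#include <stdio.h>

/* balanced now short-circuits the load test instead of trailing it. */
static bool wake_affine_model(bool sync, bool balanced,
                              unsigned long tl, unsigned long load,
                              unsigned long prev_load,
                              unsigned long tl_per_task)
{
        if (sync && balanced)
                return true;

        return balanced || (tl <= load && tl + prev_load <= tl_per_task);
}

int main(void)
{
        /* An imbalanced domain still needs the cheap-to-pull load test. */
        printf("%d\n", wake_affine_model(false, false, 512, 1024, 256, 1024));
        return 0;
}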
@@ -1211,16 +1158,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	struct sched_domain *sd, *this_sd = NULL;
 	int prev_cpu, this_cpu, new_cpu;
 	unsigned long load, this_load;
-	struct rq *rq, *this_rq;
+	struct rq *this_rq;
 	unsigned int imbalance;
 	int idx;
 
 	prev_cpu = task_cpu(p);
-	rq = task_rq(p);
 	this_cpu = smp_processor_id();
 	this_rq = cpu_rq(this_cpu);
 	new_cpu = prev_cpu;
 
+	if (prev_cpu == this_cpu)
+		goto out;
 	/*
 	 * 'this_sd' is the first domain that both
 	 * this_cpu and prev_cpu are present in:
@@ -1248,13 +1196,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	load = source_load(prev_cpu, idx);
 	this_load = target_load(this_cpu, idx);
 
-	if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+	if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
 			load, this_load, imbalance))
 		return this_cpu;
 
-	if (prev_cpu == this_cpu)
-		goto out;
-
 	/*
 	 * Start passive balancing when half the imbalance_pct
 	 * limit is reached.
@@ -1281,62 +1226,20 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 	 * + nice tasks.
 	 */
 	if (sched_feat(ASYM_GRAN))
-		gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
-	else
-		gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
+		gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
 
 	return gran;
 }
 
 /*
- * Should 'se' preempt 'curr'.
- *
- *             |s1
- *        |s2
- *   |s3
- *         g
- *      |<--->|c
- *
- *  w(c, s1) = -1
- *  w(c, s2) =  0
- *  w(c, s3) =  1
- *
- */
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
-{
-	s64 gran, vdiff = curr->vruntime - se->vruntime;
-
-	if (vdiff < 0)
-		return -1;
-
-	gran = wakeup_gran(curr);
-	if (vdiff > gran)
-		return 1;
-
-	return 0;
-}
-
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-	int depth = 0;
-
-	for_each_sched_entity(se)
-		depth++;
-
-	return depth;
-}
-
-/*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 {
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
-	int se_depth, pse_depth;
+	s64 delta_exec;
 
 	if (unlikely(rt_prio(p->prio))) {
 		update_rq_clock(rq);
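With calc_delta_asym() gone, wakeup_gran() now scales the granularity with calc_delta_mine(gran, NICE_0_LOAD, &se->load), i.e. roughly gran * NICE_0_LOAD / weight, so a heavier (more important) wakee sees a smaller granularity and preempts sooner. The plain-integer model below is not the kernel routine; the 10ms figure and the weights are example values only.

#include <stdio.h>

#define NICE_0_LOAD     1024ULL

/* Granularity shrinks as the wakee's weight grows past nice 0. */
static unsigned long long wakeup_gran_model(unsigned long long gran_ns,
                                            unsigned long long weight)
{
        return gran_ns * NICE_0_LOAD / weight;
}

int main(void)
{
        /* 10ms: unchanged at nice-0 weight, halved for a double-weight entity. */
        printf("%llu %llu\n",
               wakeup_gran_model(10000000ULL, 1024),
               wakeup_gran_model(10000000ULL, 2048));
        return 0;
}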
@@ -1351,6 +1254,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	cfs_rq_of(pse)->next = pse;
 
 	/*
+	 * We can come here with TIF_NEED_RESCHED already set from new task
+	 * wake up path.
+	 */
+	if (test_tsk_need_resched(curr))
+		return;
+
+	/*
 	 * Batch tasks do not preempt (their preemption is driven by
 	 * the tick):
 	 */
@@ -1360,33 +1270,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
-	/*
-	 * preemption test can be made between sibling entities who are in the
-	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
-	 * both tasks until we find their ancestors who are siblings of common
-	 * parent.
-	 */
-
-	/* First walk up until both entities are at same depth */
-	se_depth = depth_se(se);
-	pse_depth = depth_se(pse);
-
-	while (se_depth > pse_depth) {
-		se_depth--;
-		se = parent_entity(se);
-	}
-
-	while (pse_depth > se_depth) {
-		pse_depth--;
-		pse = parent_entity(pse);
-	}
-
-	while (!is_same_group(se, pse)) {
-		se = parent_entity(se);
-		pse = parent_entity(pse);
+	if (sched_feat(WAKEUP_OVERLAP) && (sync ||
+			(se->avg_overlap < sysctl_sched_migration_cost &&
+			 pse->avg_overlap < sysctl_sched_migration_cost))) {
+		resched_task(curr);
+		return;
 	}
 
-	if (wakeup_preempt_entity(se, pse) == 1)
+	delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+	if (delta_exec > wakeup_gran(pse))
 		resched_task(curr);
 }
 
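The rewritten check_preempt_wakeup() no longer walks the group hierarchy: it preempts when the running task has already used more CPU in its current slice than the wakee's wakeup granularity (delta_exec = sum_exec_runtime - prev_sum_exec_runtime), or right away for sync/overlapping wakeups when WAKEUP_OVERLAP is set. The sketch below is a simplified model, not the patch's code: the avg_overlap branch is folded into the sync flag and the granularity is a fixed example value.

#include <stdbool.h>
#include <stdio.h>

#define GRAN_NS         5000000ULL      /* pretend wakeup_gran(pse) result */

static bool should_preempt(bool overlap_feat, bool sync,
                           unsigned long long curr_ran_ns)
{
        if (overlap_feat && sync)
                return true;            /* overlapping pair: preempt now */

        return curr_ran_ns > GRAN_NS;   /* current task ran past the gran */
}

int main(void)
{
        printf("%d %d\n",
               should_preempt(true, true, 0),
               should_preempt(false, false, 7000000));
        return 0;
}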
@@ -1445,19 +1337,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
 	if (next == &cfs_rq->tasks)
 		return NULL;
 
-	/* Skip over entities that are not tasks */
-	do {
-		se = list_entry(next, struct sched_entity, group_node);
-		next = next->next;
-	} while (next != &cfs_rq->tasks && !entity_is_task(se));
-
-	if (next == &cfs_rq->tasks)
-		return NULL;
-
-	cfs_rq->balance_iterator = next;
-
-	if (entity_is_task(se))
-		p = task_of(se);
+	se = list_entry(next, struct sched_entity, group_node);
+	p = task_of(se);
+	cfs_rq->balance_iterator = next->next;
 
 	return p;
 }
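This iterator loses its skip loop because the account_entity_enqueue()/dequeue() hunks earlier in the patch now link only task entities onto cfs_rq->tasks, so every node on the list can be returned directly. A tiny user-space list model of that invariant; the structs and names are illustrative, not the kernel's.

#include <stddef.h>
#include <stdio.h>

struct task_node {
        const char *name;
        struct task_node *next;
};

struct tasks_list {
        struct task_node *head;
        struct task_node *it;   /* plays the role of balance_iterator */
};

/* Every node is a task, so no entity_is_task() filtering is needed. */
static struct task_node *lb_next(struct tasks_list *l)
{
        struct task_node *p = l->it;

        if (!p)
                return NULL;
        l->it = p->next;
        return p;
}

int main(void)
{
        struct task_node b = { "B", NULL }, a = { "A", &b };
        struct tasks_list l = { &a, &a };
        struct task_node *p;

        while ((p = lb_next(&l)))
                printf("%s\n", p->name);
        return 0;
}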
@@ -1507,7 +1389,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	rcu_read_lock();
 	update_h_load(busiest_cpu);
 
-	list_for_each_entry(tg, &task_groups, list) {
+	list_for_each_entry_rcu(tg, &task_groups, list) {
 		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
 		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
 		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
@@ -1620,10 +1502,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		 * 'current' within the tree based on its new key value.
 		 */
 		swap(curr->vruntime, se->vruntime);
+		resched_task(rq->curr);
 	}
 
 	enqueue_task_fair(rq, p, 0);
-	resched_task(rq->curr);
 }
 
 /*
@@ -1642,7 +1524,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
 	} else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 /*
@@ -1659,7 +1541,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
 	if (running)
 		resched_task(rq->curr);
 	else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 /* Account for a task changing its policy or group.